diff --git a/.ci/generate_test_report_lib.py b/.ci/generate_test_report_lib.py index 36c95852452ac..7820fbda803d7 100644 --- a/.ci/generate_test_report_lib.py +++ b/.ci/generate_test_report_lib.py @@ -41,10 +41,12 @@ def _parse_ninja_log(ninja_log: list[str]) -> list[tuple[str, str]]: # touch test/4.stamp # # index will point to the line that starts with Failed:. The progress - # indicator is the line before this ([4/5] test/4.stamp) and contains a pretty - # printed version of the target being built (test/4.stamp). We use this line - # and remove the progress information to get a succinct name for the target. - failing_action = ninja_log[index - 1].split("] ")[1] + # indicator is sometimes the line before this ([4/5] test/4.stamp) and + # will contain a pretty printed version of the target being built + # (test/4.stamp) when accurate. We instead parse the failed line rather + # than the progress indicator as the progress indicator may not be + # aligned with the failure. + failing_action = ninja_log[index].split("FAILED: ")[1] failure_log = [] while ( index < len(ninja_log) diff --git a/.ci/generate_test_report_lib_test.py b/.ci/generate_test_report_lib_test.py index 431e10da6405a..4068a3b7300a4 100644 --- a/.ci/generate_test_report_lib_test.py +++ b/.ci/generate_test_report_lib_test.py @@ -39,7 +39,7 @@ def test_find_failure_ninja_logs(self): self.assertEqual( failures[0], ( - "test/4.stamp", + "touch test/4.stamp", dedent( """\ FAILED: touch test/4.stamp @@ -77,7 +77,7 @@ def test_ninja_log_end(self): self.assertEqual( failures[0], ( - "test/3.stamp", + "touch test/3.stamp", dedent( """\ FAILED: touch test/3.stamp @@ -106,7 +106,7 @@ def test_ninja_log_multiple_failures(self): self.assertEqual( failures[0], ( - "test/2.stamp", + "touch test/2.stamp", dedent( """\ FAILED: touch test/2.stamp @@ -117,7 +117,7 @@ def test_ninja_log_multiple_failures(self): self.assertEqual( failures[1], ( - "test/4.stamp", + "touch test/4.stamp", dedent( """\ FAILED: touch test/4.stamp @@ -150,7 +150,7 @@ def test_ninja_log_runtimes_failure(self): self.assertEqual( failures[0], ( - "test/2.stamp", + "touch test/2.stamp", dedent( """\ FAILED: touch test/2.stamp @@ -159,6 +159,34 @@ def test_ninja_log_runtimes_failure(self): ), ) + # Test that we correctly handle cases where the FAILED: line does not + # match up with the progress indicator. + def test_ninja_log_mismatched_failed(self): + failures = generate_test_report_lib.find_failure_in_ninja_logs( + [ + [ + "[1/5] test/1.stamp", + "[2/5] test/2.stamp", + "ModuleNotFoundError: No module named 'mount_langley'", + "FAILED: tools/check-langley", + "Wow! This system is really broken!", + "[5/5] test/5.stamp", + ] + ] + ) + self.assertEqual(len(failures), 1) + self.assertEqual( + failures[0], + ( + "tools/check-langley", + dedent( + """\ + FAILED: tools/check-langley + Wow! This system is really broken!""" + ), + ), + ) + def test_title_only(self): self.assertEqual( generate_test_report_lib.generate_report("Foo", 0, [], []), @@ -448,7 +476,7 @@ def test_no_failures_multiple_build_failed_ninja_log(self): All tests passed but another part of the build **failed**. Click on a failure below to see the details.
- test/2.stamp + touch test/2.stamp ``` FAILED: touch test/2.stamp @@ -456,7 +484,7 @@ def test_no_failures_multiple_build_failed_ninja_log(self): ```
- test/4.stamp + touch test/4.stamp ``` FAILED: touch test/4.stamp diff --git a/.github/workflows/build-ci-container-tooling.yml b/.github/workflows/build-ci-container-tooling.yml index c77c78617666d..992947eb2fffb 100644 --- a/.github/workflows/build-ci-container-tooling.yml +++ b/.github/workflows/build-ci-container-tooling.yml @@ -63,7 +63,7 @@ jobs: podman save ${{ steps.vars.outputs.container-name-lint-tag }} > ${{ steps.vars.outputs.container-lint-filename }} - name: Upload container image - uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: container-amd64 path: "*.tar" diff --git a/.github/workflows/check-ci.yml b/.github/workflows/check-ci.yml index f18a69c192ee9..6ecad5536109b 100644 --- a/.github/workflows/check-ci.yml +++ b/.github/workflows/check-ci.yml @@ -28,7 +28,7 @@ jobs: - name: Setup Python uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: 3.13 + python-version: 3.14 cache: 'pip' - name: Install Python Dependencies run: | diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index b5f3413fe3b6b..7374777cb759c 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -97,7 +97,7 @@ jobs: - name: Setup Python env uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 with: - python-version: '3.13' + python-version: '3.14' cache: 'pip' cache-dependency-path: 'llvm/docs/requirements-hashed.txt' - name: Install python dependencies diff --git a/.github/workflows/gha-codeql.yml b/.github/workflows/gha-codeql.yml index 63388ebc706bd..6d490ca2c4b29 100644 --- a/.github/workflows/gha-codeql.yml +++ b/.github/workflows/gha-codeql.yml @@ -29,9 +29,9 @@ jobs: sparse-checkout: | .github/ - name: Initialize CodeQL - uses: github/codeql-action/init@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4 + uses: github/codeql-action/init@5d5cd550d3e189c569da8f16ea8de2d821c9bf7a # v3.31.2 with: languages: actions queries: security-extended - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@303c0aef88fc2fe5ff6d63d3b1596bfd83dfa1f9 # v3.30.4 + uses: github/codeql-action/analyze@5d5cd550d3e189c569da8f16ea8de2d821c9bf7a # v3.31.2 diff --git a/.github/workflows/hlsl-test-all.yaml b/.github/workflows/hlsl-test-all.yaml index dcb852312d41a..ce6ccfa23df6a 100644 --- a/.github/workflows/hlsl-test-all.yaml +++ b/.github/workflows/hlsl-test-all.yaml @@ -54,7 +54,7 @@ jobs: path: golden-images - name: Setup Windows if: runner.os == 'Windows' - uses: llvm/actions/setup-windows@main + uses: llvm/actions/setup-windows@42d80571b13f4599bbefbc7189728b64723c7f78 # main with: arch: amd64 - name: Build DXC @@ -80,7 +80,7 @@ jobs: ninja check-hlsl-unit ninja ${{ inputs.TestTarget }} - name: Publish Test Results - uses: EnricoMi/publish-unit-test-result-action/macos@3a74b2957438d0b6e2e61d67b05318aa25c9e6c6 # v2.20.0 + uses: EnricoMi/publish-unit-test-result-action/macos@34d7c956a59aed1bfebf31df77b8de55db9bbaaf # v2.21.0 if: always() && runner.os == 'macOS' with: comment_mode: off diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml index 5ccf976848197..432c45744abda 100644 --- a/.github/workflows/libclang-abi-tests.yml +++ b/.github/workflows/libclang-abi-tests.yml @@ -100,7 +100,7 @@ jobs: repo: ${{ github.repository }} steps: - name: Install Ninja - uses: llvm/actions/install-ninja@main + uses: 
llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - name: Install abi-compliance-checker run: | sudo apt-get update diff --git a/.github/workflows/libcxx-build-containers.yml b/.github/workflows/libcxx-build-containers.yml index 312cb47fc3d93..4bce86145fc0c 100644 --- a/.github/workflows/libcxx-build-containers.yml +++ b/.github/workflows/libcxx-build-containers.yml @@ -55,7 +55,7 @@ jobs: TAG: ${{ github.sha }} - name: Log in to GitHub Container Registry - uses: docker/login-action@184bdaa0721073962dff0199f1fb9940f07167d1 # v3.5.0 + uses: docker/login-action@5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0 with: registry: ghcr.io username: ${{ github.actor }} diff --git a/.github/workflows/libcxx-run-benchmarks.yml b/.github/workflows/libcxx-run-benchmarks.yml index 9e8f55859fc7a..e2ca940d2f0b3 100644 --- a/.github/workflows/libcxx-run-benchmarks.yml +++ b/.github/workflows/libcxx-run-benchmarks.yml @@ -35,7 +35,7 @@ jobs: steps: - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: '3.13' + python-version: '3.14' - name: Extract information from the PR id: vars diff --git a/.github/workflows/llvm-abi-tests.yml b/.github/workflows/llvm-abi-tests.yml index f73d180bb0005..961f1cc79389d 100644 --- a/.github/workflows/llvm-abi-tests.yml +++ b/.github/workflows/llvm-abi-tests.yml @@ -88,7 +88,7 @@ jobs: repo: ${{ github.repository }} steps: - name: Install Ninja - uses: llvm/actions/install-ninja@main + uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - name: Install abi-compliance-checker run: | sudo apt-get update diff --git a/.github/workflows/llvm-bugs.yml b/.github/workflows/llvm-bugs.yml index 7d42abfadde7b..3274f1adf9e61 100644 --- a/.github/workflows/llvm-bugs.yml +++ b/.github/workflows/llvm-bugs.yml @@ -39,6 +39,12 @@ jobs: repo: context.repo.repo }) .then((issue) => { + var maybeTruncatedBody = issue.data.body; + if (maybeTruncatedBody.length > 15000) { + maybeTruncatedBody = maybeTruncatedBody.substring(0, + 15000) + + "Please see the issue for the entire body." 
+ } const payload = { author : issue.data.user.login, issue : issue.data.number, @@ -46,7 +52,7 @@ jobs: url : issue.data.html_url, labels : issue.data.labels.map((label) => label.name), assignee : issue.data.assignees.map((assignee) => assignee.login), - body : issue.data.body + body : maybeTruncatedBody }; const data = { diff --git a/.github/workflows/new-issues.yml b/.github/workflows/new-issues.yml index 8480a657cc717..a5dcad28dbe24 100644 --- a/.github/workflows/new-issues.yml +++ b/.github/workflows/new-issues.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-24.04 if: github.repository == 'llvm/llvm-project' steps: - - uses: llvm/actions/issue-labeler@main + - uses: llvm/actions/issue-labeler@42d80571b13f4599bbefbc7189728b64723c7f78 # main with: repo-token: ${{ secrets.ISSUE_SUBSCRIBER_TOKEN }} configuration-path: .github/new-issues-labeler.yml diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 6303a119750b5..973d3abf358ce 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -190,7 +190,7 @@ jobs: with: max-size: "2000M" - name: Install Ninja - uses: llvm/actions/install-ninja@main + uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - name: Build and Test run: | source <(git diff --name-only HEAD~1...HEAD | python3 .ci/compute_projects.py) diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index fa73b9d9fe8d0..25f426b7814df 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -68,7 +68,7 @@ jobs: # due to https://github.com/actions/runner-images/issues/10385 - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: - python-version: '3.13' + python-version: '3.14' - name: Checkout LLVM uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 @@ -138,7 +138,6 @@ jobs: target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF" fi - echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT case "${{ inputs.runs-on }}" in ubuntu-22.04*) build_runs_on="depot-${{ inputs.runs-on }}-16" @@ -157,6 +156,23 @@ jobs: build_runs_on=$test_runs_on ;; esac + + case "$build_runs_on" in + # These runners cannot build the full release package faster than + # the 6 hours timeout limit, so we need to use a configuration + # that builds more quickly. + macos-14) + bootstrap_prefix="BOOTSTRAP" + target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF -DLLVM_RELEASE_ENABLE_PGO=OFF" + ;; + *) + bootstrap_prefix="BOOTSTRAP_BOOTSTRAP" + ;; + esac + + target_cmake_flags="$target_cmake_flags -D${bootstrap_prefix}_CPACK_PACKAGE_FILE_NAME=$release_binary_basename" + + echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT echo "build-runs-on=$build_runs_on" >> $GITHUB_OUTPUT echo "test-runs-on=$test_runs_on" >> $GITHUB_OUTPUT @@ -173,11 +189,11 @@ jobs: ref: ${{ needs.prepare.outputs.ref }} - name: Install Ninja - uses: llvm/actions/install-ninja@a1ea791b03c8e61f53a0e66f2f73db283aa0f01e # main + uses: llvm/actions/install-ninja@42d80571b13f4599bbefbc7189728b64723c7f78 # main - name: Setup Windows if: startsWith(runner.os, 'Windows') - uses: llvm/actions/setup-windows@main + uses: llvm/actions/setup-windows@42d80571b13f4599bbefbc7189728b64723c7f78 # main with: arch: amd64 @@ -200,8 +216,7 @@ jobs: # so we need to set some extra cmake flags to disable this. 
cmake -G Ninja -S llvm -B ${{ steps.setup-stage.outputs.build-prefix }}/build \ ${{ needs.prepare.outputs.target-cmake-flags }} \ - -C clang/cmake/caches/Release.cmake \ - -DBOOTSTRAP_BOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}" + -C clang/cmake/caches/Release.cmake - name: Build shell: bash diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index c07df338cf989..bd3277a8b452c 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -36,7 +36,7 @@ jobs: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@05b42c624433fc40578a4040d5cf5e36ddca8cde # v2.4.2 + uses: ossf/scorecard-action@4eaacf0543bb3f2c246792bd56e8cdeffafb205a # v2.4.3 with: results_file: results.sarif results_format: sarif diff --git a/bolt/README.md b/bolt/README.md index 902d1eb6e7694..55f742c5019f5 100644 --- a/bolt/README.md +++ b/bolt/README.md @@ -173,7 +173,7 @@ Once you have `perf.fdata` ready, you can use it for optimizations with BOLT. Assuming your environment is setup to include the right path, execute `llvm-bolt`: ``` -$ llvm-bolt -o .bolt -data=perf.fdata -reorder-blocks=ext-tsp -reorder-functions=hfsort -split-functions -split-all-cold -split-eh -dyno-stats +$ llvm-bolt -o .bolt -data=perf.fdata -reorder-blocks=ext-tsp -reorder-functions=cdsort -split-functions -split-all-cold -split-eh -dyno-stats ``` If you do need an updated debug info, then add `-update-debug-sections` option diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md index 43ceceee7de45..7c6e01d669b74 100644 --- a/bolt/docs/CommandLineArgumentReference.md +++ b/bolt/docs/CommandLineArgumentReference.md @@ -381,11 +381,6 @@ Set verbosity level for diagnostic output -- `--write-dwp` - - Output a single dwarf package file (dwp) instead of multiple non-relocatable - dwarf object files (dwo). - ### BOLT optimization options: - `--align-blocks` diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index d666c10885ad5..5e349cd69fb43 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -840,6 +840,16 @@ class MCPlusBuilder { return false; } + virtual bool isLDRWl(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + + virtual bool isLDRXl(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + virtual bool isMOVW(const MCInst &Inst) const { llvm_unreachable("not implemented"); return false; @@ -1789,6 +1799,19 @@ class MCPlusBuilder { llvm_unreachable("not implemented"); } + /// Take \p LDRInst and return ADRP+LDR instruction sequence - for + /// + /// ldr x0, [label] + /// + /// the following sequence will be generated: + /// + /// adrp x0, PageBase(label) + /// ldr x0, [x0, PageOffset(label)] + virtual InstructionListType createAdrpLdr(const MCInst &LDRInst, + MCContext *Ctx) const { + llvm_unreachable("not implemented"); + } + /// Return not 0 if the instruction CurInst, in combination with the recent /// history of disassembled instructions supplied by [Begin, End), is a linker /// generated veneer/stub that needs patching. 
This happens in AArch64 when diff --git a/bolt/include/bolt/Passes/ADRRelaxationPass.h b/bolt/include/bolt/Passes/AArch64RelaxationPass.h similarity index 51% rename from bolt/include/bolt/Passes/ADRRelaxationPass.h rename to bolt/include/bolt/Passes/AArch64RelaxationPass.h index b9f92dec7f03b..b9185a1e34388 100644 --- a/bolt/include/bolt/Passes/ADRRelaxationPass.h +++ b/bolt/include/bolt/Passes/AArch64RelaxationPass.h @@ -1,4 +1,4 @@ -//===- bolt/Passes/ADRRelaxationPass.h --------------------------*- C++ -*-===// +//===- bolt/Passes/AArch64RelaxationPass.h ----------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,29 +6,29 @@ // //===----------------------------------------------------------------------===// // -// This file declares the ADRRelaxationPass class, which replaces AArch64 -// non-local ADR instructions with ADRP + ADD due to small offset range of ADR -// instruction (+- 1MB) which could be easily overflowed after BOLT -// optimizations. Such problems are usually connected with errata 843419 -// https://developer.arm.com/documentation/epm048406/2100/ +// This file declares the AArch64RelaxationPass class, which replaces AArch64 +// non-local ADR/LDR instructions with ADRP + ADD/LDR due to small offset +// range of ADR and LDR instruction (+- 1MB) which could be easily overflowed +// after BOLT optimizations. Such problems are usually connected with errata +// 843419: https://developer.arm.com/documentation/epm048406/2100/ // The linker could replace ADRP instruction with ADR in some cases. // //===----------------------------------------------------------------------===// -#ifndef BOLT_PASSES_ADRRELAXATIONPASS_H -#define BOLT_PASSES_ADRRELAXATIONPASS_H +#ifndef BOLT_PASSES_AARCH64RELAXATIONPASS_H +#define BOLT_PASSES_AARCH64RELAXATIONPASS_H #include "bolt/Passes/BinaryPasses.h" namespace llvm { namespace bolt { -class ADRRelaxationPass : public BinaryFunctionPass { +class AArch64RelaxationPass : public BinaryFunctionPass { public: - explicit ADRRelaxationPass(const cl::opt &PrintPass) + explicit AArch64RelaxationPass(const cl::opt &PrintPass) : BinaryFunctionPass(PrintPass) {} - const char *getName() const override { return "adr-relaxation"; } + const char *getName() const override { return "aarch64-relaxation"; } /// Pass entry point Error runOnFunctions(BinaryContext &BC) override; diff --git a/bolt/include/bolt/Passes/FixRelaxationPass.h b/bolt/include/bolt/Passes/FixRelaxationPass.h index 50b64480aa62e..cf5a8a1fcb134 100644 --- a/bolt/include/bolt/Passes/FixRelaxationPass.h +++ b/bolt/include/bolt/Passes/FixRelaxationPass.h @@ -1,4 +1,4 @@ -//===- bolt/Passes/ADRRelaxationPass.h --------------------------*- C++ -*-===// +//===- bolt/Passes/FixRelaxationPass.h --------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index a383ced1712e3..7af32c8c56635 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -78,6 +78,11 @@ cl::opt CompDirOverride( "to *.dwo files."), cl::Hidden, cl::init(""), cl::cat(BoltCategory)); +static cl::opt CloneConstantIsland("clone-constant-island", + cl::desc("clone constant islands"), + cl::Hidden, cl::init(true), + cl::ZeroOrMore, cl::cat(BoltCategory)); + static cl::opt FailOnInvalidPadding("fail-on-invalid-padding", cl::Hidden, cl::init(false), cl::desc("treat invalid code padding as error"), @@ -461,7 +466,8 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF, // of dynamic relocs, as we currently do not support cloning them. // Notice: we might fail to link because of this, if the original constant // island we are referring would be emitted too far away. - if (IslandIter->second->hasDynamicRelocationAtIsland()) { + if (IslandIter->second->hasDynamicRelocationAtIsland() || + !opts::CloneConstantIsland) { MCSymbol *IslandSym = IslandIter->second->getOrCreateIslandAccess(Address); if (IslandSym) @@ -469,6 +475,12 @@ BinaryContext::handleAddressRef(uint64_t Address, BinaryFunction &BF, } else if (MCSymbol *IslandSym = IslandIter->second->getOrCreateProxyIslandAccess(Address, BF)) { + LLVM_DEBUG( + dbgs() << "BOLT-DEBUG: clone constant island at address 0x" + << Twine::utohexstr(IslandIter->first) << " with size of 0x" + << Twine::utohexstr( + IslandIter->second->estimateConstantIslandSize()) + << " bytes, referenced by " << BF << "\n"); BF.createIslandDependency(IslandSym, IslandIter->second); return std::make_pair(IslandSym, 0); } @@ -778,13 +790,17 @@ void BinaryContext::populateJumpTables() { } if (opts::StrictMode && DataPCRelocations.size()) { - LLVM_DEBUG({ - dbgs() << DataPCRelocations.size() - << " unclaimed PC-relative relocations left in data:\n"; - for (uint64_t Reloc : DataPCRelocations) - dbgs() << Twine::utohexstr(Reloc) << '\n'; - }); - assert(0 && "unclaimed PC-relative relocations left in data\n"); + this->errs() << "BOLT-ERROR: " << DataPCRelocations.size() + << " unclaimed PC-relative relocation(s) left in data"; + if (opts::Verbosity) { + this->errs() << ":\n"; + for (uint64_t RelocOffset : DataPCRelocations) + this->errs() << " @0x" << Twine::utohexstr(RelocOffset) << '\n'; + } else { + this->errs() << ". Re-run with -v=1 to see the list\n"; + } + this->errs() << "BOLT-ERROR: unable to proceed with --strict\n"; + exit(1); } clearList(DataPCRelocations); } diff --git a/bolt/lib/Passes/ADRRelaxationPass.cpp b/bolt/lib/Passes/AArch64RelaxationPass.cpp similarity index 67% rename from bolt/lib/Passes/ADRRelaxationPass.cpp rename to bolt/lib/Passes/AArch64RelaxationPass.cpp index c3954c94a7f92..610adad58cfcb 100644 --- a/bolt/lib/Passes/ADRRelaxationPass.cpp +++ b/bolt/lib/Passes/AArch64RelaxationPass.cpp @@ -1,4 +1,4 @@ -//===- bolt/Passes/ADRRelaxationPass.cpp ----------------------------------===// +//===- bolt/Passes/AArch64RelaxationPass.cpp ------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// // -// This file implements the ADRRelaxationPass class. +// This file implements the AArch64RelaxationPass class. 
// //===----------------------------------------------------------------------===// -#include "bolt/Passes/ADRRelaxationPass.h" +#include "bolt/Passes/AArch64RelaxationPass.h" #include "bolt/Core/ParallelUtilities.h" #include "bolt/Utils/CommandLineOpts.h" #include @@ -20,10 +20,10 @@ using namespace llvm; namespace opts { extern cl::OptionCategory BoltCategory; -static cl::opt - AdrPassOpt("adr-relaxation", - cl::desc("Replace ARM non-local ADR instructions with ADRP"), - cl::init(true), cl::cat(BoltCategory), cl::ReallyHidden); +static cl::opt AArch64PassOpt( + "aarch64-relaxation", + cl::desc("Replace ARM non-local ADR/LDR instructions with ADRP"), + cl::init(true), cl::cat(BoltCategory), cl::ReallyHidden); } // namespace opts namespace llvm { @@ -35,7 +35,7 @@ namespace bolt { // jobs and checking the exit flag after it. static bool PassFailed = false; -void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { +void AArch64RelaxationPass::runOnFunction(BinaryFunction &BF) { if (PassFailed) return; @@ -43,10 +43,13 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { for (BinaryBasicBlock &BB : BF) { for (auto It = BB.begin(); It != BB.end(); ++It) { MCInst &Inst = *It; - if (!BC.MIB->isADR(Inst)) + bool IsADR = BC.MIB->isADR(Inst); + + // TODO: Handle other types of LDR (literal, PC-relative) instructions. + if (!IsADR && !BC.MIB->isLDRXl(Inst) && !BC.MIB->isLDRWl(Inst)) continue; - const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Inst); + const MCSymbol *Symbol = BC.MIB->getTargetSymbol(Inst, IsADR ? 0 : 1); if (!Symbol) continue; @@ -56,25 +59,27 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { continue; } - // Don't relax ADR if it points to the same function and is in the main - // fragment and BF initial size is < 1MB. + // Don't relax ADR/LDR if it points to the same function and is in the + // main fragment and BF initial size is < 1MB. const unsigned OneMB = 0x100000; if (BF.getSize() < OneMB) { BinaryFunction *TargetBF = BC.getFunctionForSymbol(Symbol); if (TargetBF == &BF && !BB.isSplit()) continue; - // No relaxation needed if ADR references a basic block in the same + // No relaxation needed if ADR/LDR references a basic block in the same // fragment. if (BinaryBasicBlock *TargetBB = BF.getBasicBlockForLabel(Symbol)) if (BB.getFragmentNum() == TargetBB->getFragmentNum()) continue; } - InstructionListType AdrpAdd; + InstructionListType AdrpMaterialization; { auto L = BC.scopeLock(); - AdrpAdd = BC.MIB->undoAdrpAddRelaxation(Inst, BC.Ctx.get()); + AdrpMaterialization = + IsADR ? BC.MIB->undoAdrpAddRelaxation(Inst, BC.Ctx.get()) + : BC.MIB->createAdrpLdr(Inst, BC.Ctx.get()); } if (It != BB.begin() && BC.MIB->isNoop(*std::prev(It))) { @@ -88,18 +93,18 @@ void ADRRelaxationPass::runOnFunction(BinaryFunction &BF) { // invalidate this offset, so we have to rely on linker-inserted NOP to // replace it with ADRP, and abort if it is not present. auto L = BC.scopeLock(); - BC.errs() << "BOLT-ERROR: cannot relax ADR in non-simple function " - << BF << '\n'; + BC.errs() << "BOLT-ERROR: cannot relax " << (IsADR ? 
"ADR" : "LDR") + << " in non-simple function " << BF << '\n'; PassFailed = true; return; } - It = BB.replaceInstruction(It, AdrpAdd); + It = BB.replaceInstruction(It, AdrpMaterialization); } } } -Error ADRRelaxationPass::runOnFunctions(BinaryContext &BC) { - if (!opts::AdrPassOpt || !BC.HasRelocations) +Error AArch64RelaxationPass::runOnFunctions(BinaryContext &BC) { + if (!opts::AArch64PassOpt || !BC.HasRelocations) return Error::success(); ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) { @@ -108,7 +113,7 @@ Error ADRRelaxationPass::runOnFunctions(BinaryContext &BC) { ParallelUtilities::runOnEachFunction( BC, ParallelUtilities::SchedulingPolicy::SP_TRIVIAL, WorkFun, nullptr, - "ADRRelaxationPass"); + "AArch64RelaxationPass"); if (PassFailed) return createFatalBOLTError(""); diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt index d7519518f186f..3197e62faad21 100644 --- a/bolt/lib/Passes/CMakeLists.txt +++ b/bolt/lib/Passes/CMakeLists.txt @@ -1,5 +1,5 @@ add_llvm_library(LLVMBOLTPasses - ADRRelaxationPass.cpp + AArch64RelaxationPass.cpp Aligner.cpp AllocCombiner.cpp AsmDump.cpp diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 4e062038a3e4c..8554683bc3cf8 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -564,13 +564,18 @@ void DataAggregator::imputeFallThroughs() { // Skip fall-throughs in external code. if (Trace.From == Trace::EXTERNAL) continue; - std::pair CurrentBranch(Trace.Branch, Trace.From); + if (std::pair CurrentBranch(Trace.Branch, Trace.From); + CurrentBranch != PrevBranch) { + // New group: reset aggregates. + AggregateCount = AggregateFallthroughSize = 0; + PrevBranch = CurrentBranch; + } // BR_ONLY must be the last trace in the group if (Trace.To == Trace::BR_ONLY) { // If the group is not empty, use aggregate values, otherwise 0-length // for unconditional jumps (call/ret/uncond branch) or 1-length for others uint64_t InferredBytes = - PrevBranch == CurrentBranch + AggregateFallthroughSize ? AggregateFallthroughSize / AggregateCount : !checkUnconditionalControlTransfer(Trace.From); Trace.To = Trace.From + InferredBytes; @@ -578,16 +583,11 @@ void DataAggregator::imputeFallThroughs() { << " bytes)\n"); ++InferredTraces; } else { - // Trace with a valid fall-through - // New group: reset aggregates. 
- if (CurrentBranch != PrevBranch) - AggregateCount = AggregateFallthroughSize = 0; // Only use valid fall-through lengths if (Trace.To != Trace::EXTERNAL) AggregateFallthroughSize += (Trace.To - Trace.From) * Info.TakenCount; AggregateCount += Info.TakenCount; } - PrevBranch = CurrentBranch; } if (opts::Verbosity >= 1) outs() << "BOLT-INFO: imputed " << InferredTraces << " traces\n"; diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp index 782137e807662..1a0f6d75d63e8 100644 --- a/bolt/lib/Rewrite/BinaryPassManager.cpp +++ b/bolt/lib/Rewrite/BinaryPassManager.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "bolt/Rewrite/BinaryPassManager.h" -#include "bolt/Passes/ADRRelaxationPass.h" +#include "bolt/Passes/AArch64RelaxationPass.h" #include "bolt/Passes/Aligner.h" #include "bolt/Passes/AllocCombiner.h" #include "bolt/Passes/AsmDump.h" @@ -129,10 +129,10 @@ static cl::opt PrintJTFootprintReduction( cl::desc("print function after jt-footprint-reduction pass"), cl::Hidden, cl::cat(BoltOptCategory)); -static cl::opt - PrintAdrRelaxation("print-adr-relaxation", - cl::desc("print functions after ADR Relaxation pass"), - cl::Hidden, cl::cat(BoltOptCategory)); +static cl::opt PrintAArch64Relaxation( + "print-adr-ldr-relaxation", + cl::desc("print functions after ADR/LDR Relaxation pass"), cl::Hidden, + cl::cat(BoltOptCategory)); static cl::opt PrintLongJmp("print-longjmp", @@ -517,7 +517,7 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) { if (BC.isAArch64()) { Manager.registerPass( - std::make_unique(PrintAdrRelaxation)); + std::make_unique(PrintAArch64Relaxation)); // Tighten branches according to offset differences between branch and // targets. No extra instructions after this pass, otherwise we may have diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 7769162d67eaf..57db6a436c5c6 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -142,6 +142,7 @@ static InstructionListType createIncMemory(MCPhysReg RegTo, MCPhysReg RegTmp) { atomicAdd(Insts.back(), RegTo, RegTmp); return Insts; } + class AArch64MCPlusBuilder : public MCPlusBuilder { public: using MCPlusBuilder::MCPlusBuilder; @@ -583,6 +584,14 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return Inst.getOpcode() == AArch64::ADDXri; } + bool isLDRWl(const MCInst &Inst) const override { + return Inst.getOpcode() == AArch64::LDRWl; + } + + bool isLDRXl(const MCInst &Inst) const override { + return Inst.getOpcode() == AArch64::LDRXl; + } + MCPhysReg getADRReg(const MCInst &Inst) const { assert((isADR(Inst) || isADRP(Inst)) && "Not an ADR instruction"); assert(MCPlus::getNumPrimeOperands(Inst) != 0 && @@ -602,6 +611,39 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return materializeAddress(Target, Ctx, Reg, Addend); } + InstructionListType createAdrpLdr(const MCInst &LDRInst, + MCContext *Ctx) const override { + assert((isLDRXl(LDRInst) || isLDRWl(LDRInst)) && + "LDR (literal, 32 or 64-bit integer load) instruction expected"); + assert(LDRInst.getOperand(0).isReg() && + "unexpected operand in LDR instruction"); + const MCPhysReg DataReg = LDRInst.getOperand(0).getReg(); + const MCPhysReg AddrReg = + isLDRXl(LDRInst) ? 
DataReg + : (MCPhysReg)RegInfo->getMatchingSuperReg( + DataReg, AArch64::sub_32, + &RegInfo->getRegClass(AArch64::GPR64RegClassID)); + const MCSymbol *Target = getTargetSymbol(LDRInst, 1); + assert(Target && "missing target symbol in LDR instruction"); + + InstructionListType Insts(2); + Insts[0].setOpcode(AArch64::ADRP); + Insts[0].clear(); + Insts[0].addOperand(MCOperand::createReg(AddrReg)); + Insts[0].addOperand(MCOperand::createImm(0)); + setOperandToSymbolRef(Insts[0], /* OpNum */ 1, Target, 0, Ctx, + ELF::R_AARCH64_NONE); + Insts[1].setOpcode(isLDRXl(LDRInst) ? AArch64::LDRXui : AArch64::LDRWui); + Insts[1].clear(); + Insts[1].addOperand(MCOperand::createReg(DataReg)); + Insts[1].addOperand(MCOperand::createReg(AddrReg)); + Insts[1].addOperand(MCOperand::createImm(0)); + Insts[1].addOperand(MCOperand::createImm(0)); + setOperandToSymbolRef(Insts[1], /* OpNum */ 2, Target, 0, Ctx, + ELF::R_AARCH64_ADD_ABS_LO12_NC); + return Insts; + } + bool isTB(const MCInst &Inst) const { return (Inst.getOpcode() == AArch64::TBNZW || Inst.getOpcode() == AArch64::TBNZX || @@ -2762,7 +2804,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { BitVector WrittenRegs(RegInfo->getNumRegs()); const BitVector &SizeRegAliases = getAliases(SizeReg); - for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) { + for (auto InstIt = CallInst; InstIt != BB.begin(); --InstIt) { const MCInst &Inst = *InstIt; WrittenRegs.reset(); getWrittenRegs(Inst, WrittenRegs); diff --git a/bolt/test/AArch64/ldr-relaxation.s b/bolt/test/AArch64/ldr-relaxation.s new file mode 100644 index 0000000000000..7632504a01635 --- /dev/null +++ b/bolt/test/AArch64/ldr-relaxation.s @@ -0,0 +1,122 @@ +## Check that LDR relaxation will fail since LDR is inside a non-simple +## function and there is no NOP next to it. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym FAIL=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: not llvm-bolt %t.so -o %t.bolt 2>&1 | FileCheck %s --check-prefix=FAIL + +# FAIL: BOLT-ERROR: cannot relax LDR in non-simple function _start + +.ifdef FAIL + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + br x2 + ldr x0, _foo + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is not needed since the reference is not far away. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym NOT_NEEDED=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=NOT_NEEDED + +# NOT_NEEDED: <_start> +# NOT_NEEDED-NEXT: ldr + +.ifdef NOT_NEEDED + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + ldr x0, _start + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is done in a simple function, where NOP will +## be inserted as needed. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym RELAX_SIMPLE=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAX + +# RELAX: adrp +# RELAX-NEXT: ldr + +.ifdef RELAX_SIMPLE + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + ldr x0, _foo + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check that LDR relaxation is done in a non-simple function, where NOP +## exists next to LDR. 
+ +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym RELAX_NON_SIMPLE=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAX + +.ifdef RELAX_NON_SIMPLE + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + br x2 + ldr x0, _foo + nop + ret + .cfi_endproc +.size _start, .-_start +.endif + +## Check LDR relaxation works on loading W (low 32-bit of X) registers. + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown \ +# RUN: --defsym RELAX_SIMPLE_WREG=1 %s -o %t.o +# RUN: %clang %cflags %t.o -o %t.so -Wl,-q +# RUN: llvm-bolt %t.so -o %t.bolt +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=RELAXW + +# RELAXW: adrp x0 +# RELAXW-NEXT: ldr w0 + +.ifdef RELAX_SIMPLE_WREG + .text + .global _start + .type _start, %function +_start: + .cfi_startproc + ldr w0, _foo + ret + .cfi_endproc +.size _start, .-_start +.endif + + .section .text_cold + .global _foo + .align 3 +_foo: + .long 0x12345678 +.size _foo, .-_foo diff --git a/bolt/test/X86/callcont-fallthru.s b/bolt/test/X86/callcont-fallthru.s index 8c05491e7bca0..ef0bb55df1faf 100644 --- a/bolt/test/X86/callcont-fallthru.s +++ b/bolt/test/X86/callcont-fallthru.s @@ -15,6 +15,8 @@ # External return to a landing pad/entry point call continuation # RUN: link_fdata %s %t %t.pa-eret PREAGG-ERET # RUN-DISABLED: link_fdata %s %t %t.pa-plt PREAGG-PLT +## Fall-through imputing test cases +# RUN: link_fdata %s %t %t.pa-imp PREAGG-IMP # RUN: llvm-strip --strip-unneeded %t -o %t.strip # RUN: llvm-objcopy --remove-section=.eh_frame %t.strip %t.noeh @@ -63,6 +65,11 @@ # RUN-DISABLED: --check-prefix=CHECK-PLT # CHECK-PLT: traces mismatching disassembled function contents: 0 +## Check --impute-trace-fall-throughs accepting duplicate branch-only traces +# RUN: perf2bolt %t --pa -p %t.pa-imp -o %t.pa-imp.fdata --impute-trace-fall-through +# RUN: FileCheck %s --check-prefix=CHECK-IMP --input-file %t.pa-imp.fdata +# CHECK-IMP: 0 [unknown] 0 1 main {{.*}} 0 3 + .globl foo .type foo, %function foo: @@ -102,6 +109,8 @@ Ltmp1: Ltmp4: cmpl $0x0, -0x14(%rbp) +# PREAGG-IMP: B X:0 #Ltmp4_br# 1 0 +# PREAGG-IMP: B X:0 #Ltmp4_br# 2 0 Ltmp4_br: je Ltmp0 diff --git a/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test b/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test index 673e86bb1533a..a08e352d605fe 100644 --- a/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test +++ b/bolt/test/X86/dwarf4-ftypes-dwp-input-dwp-output.test @@ -1,4 +1,4 @@ -# UNSUPPORTED: true +# REQUIRES: system-linux ; RUN: rm -rf %t ; RUN: mkdir %t ; RUN: cd %t @@ -8,7 +8,8 @@ ; RUN: llvm-dwp -e main.exe -o main.exe.dwp ; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.exe.dwp | FileCheck -check-prefix=PRE-BOLT %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.dwp | FileCheck -check-prefix=PRE-BOLT-DWP-TU-INDEX %s -; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections --write-dwp +; RUN: llvm-bolt main.exe -o main.exe.bolt --update-debug-sections +; RUN: llvm-dwp -e main.exe.bolt -o main.exe.bolt.dwp ; RUN: llvm-dwarfdump --show-form --verbose --debug-types main.exe.bolt.dwp | FileCheck -check-prefix=BOLT %s ; RUN: llvm-dwarfdump --show-form --verbose --debug-tu-index main.exe.bolt.dwp | FileCheck -check-prefix=BOLT-DWP-TU-INDEX %s diff --git a/bolt/test/X86/unclaimed-pc-rel.s b/bolt/test/X86/unclaimed-pc-rel.s new file mode 100644 index 0000000000000..5292cccba754d --- 
/dev/null +++ b/bolt/test/X86/unclaimed-pc-rel.s @@ -0,0 +1,24 @@ +## Check that unclaimed PC-relative relocation from data to code is detected +## and reported to the user. + +# REQUIRES: system-linux + +# RUN: %clang %cflags -no-pie %s -o %t.exe -Wl,-q -nostartfiles +# RUN: not llvm-bolt %t.exe -o %t.bolt --strict 2>&1 | FileCheck %s + +# CHECK: BOLT-ERROR: 1 unclaimed PC-relative relocation(s) left in data + + .text + .globl _start + .type _start, %function +_start: + movl $42, %eax +.L0: + ret + .size _start, .-_start + +## Force relocation mode. + .reloc 0, R_X86_64_NONE + + .section .rodata + .long .L0-. diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s index dc59a08b889a7..badff299603a0 100644 --- a/bolt/test/runtime/AArch64/inline-memcpy.s +++ b/bolt/test/runtime/AArch64/inline-memcpy.s @@ -7,7 +7,7 @@ # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM -# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 16 total calls) +# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 17 total calls) # CHECK-INLINE: BOLT-INFO: inlined 11 memcpy() calls # Each function should use optimal size-specific instructions and NO memcpy calls @@ -84,6 +84,9 @@ # CHECK-ASM-LABEL: : # CHECK-ASM: bl{{.*}}: +# CHECK-ASM: bl{{.*}}: # CHECK-ASM: bl{{.*}}`_ +for `Extra Clang Tools `_ project. +.. contents:: + :depth: 2 + :local: Active Maintainers ================== diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp index 2e21a4c4fd1f9..611717a64b768 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp @@ -73,6 +73,7 @@ #include "SizeofExpressionCheck.h" #include "SpuriouslyWakeUpFunctionsCheck.h" #include "StandaloneEmptyCheck.h" +#include "StdNamespaceModificationCheck.h" #include "StringConstructorCheck.h" #include "StringIntegerAssignmentCheck.h" #include "StringLiteralWithEmbeddedNulCheck.h" @@ -237,6 +238,8 @@ class BugproneModule : public ClangTidyModule { "bugprone-spuriously-wake-up-functions"); CheckFactories.registerCheck( "bugprone-standalone-empty"); + CheckFactories.registerCheck( + "bugprone-std-namespace-modification"); CheckFactories.registerCheck( "bugprone-string-constructor"); CheckFactories.registerCheck( diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt index 31a0e6906866a..6c96996458040 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt @@ -75,6 +75,7 @@ add_clang_library(clangTidyBugproneModule STATIC SmartPtrArrayMismatchCheck.cpp SpuriouslyWakeUpFunctionsCheck.cpp StandaloneEmptyCheck.cpp + StdNamespaceModificationCheck.cpp StringConstructorCheck.cpp StringIntegerAssignmentCheck.cpp StringLiteralWithEmbeddedNulCheck.cpp diff --git a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp similarity index 95% rename from clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp rename to clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp index 79fbc66b5f8a3..13e5c03d7c4d3 100644 --- 
a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "DontModifyStdNamespaceCheck.h" +#include "StdNamespaceModificationCheck.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchersInternal.h" @@ -36,9 +36,9 @@ AST_POLYMORPHIC_MATCHER_P( } // namespace -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { -void DontModifyStdNamespaceCheck::registerMatchers(MatchFinder *Finder) { +void StdNamespaceModificationCheck::registerMatchers(MatchFinder *Finder) { auto HasStdParent = hasDeclContext(namespaceDecl(hasAnyName("std", "posix"), unless(hasParent(namespaceDecl()))) @@ -96,7 +96,7 @@ void DontModifyStdNamespaceCheck::registerMatchers(MatchFinder *Finder) { .bind("decl"), this); } -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone static const NamespaceDecl *getTopLevelLexicalNamespaceDecl(const Decl *D) { const NamespaceDecl *LastNS = nullptr; @@ -108,7 +108,7 @@ static const NamespaceDecl *getTopLevelLexicalNamespaceDecl(const Decl *D) { return LastNS; } -void clang::tidy::cert::DontModifyStdNamespaceCheck::check( +void clang::tidy::bugprone::StdNamespaceModificationCheck::check( const MatchFinder::MatchResult &Result) { const auto *D = Result.Nodes.getNodeAs("decl"); const auto *NS = Result.Nodes.getNodeAs("nmspc"); diff --git a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h similarity index 61% rename from clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h rename to clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h index cfcd878644ddb..0f62dc3d9ab70 100644 --- a/clang-tools-extra/clang-tidy/cert/DontModifyStdNamespaceCheck.h +++ b/clang-tools-extra/clang-tidy/bugprone/StdNamespaceModificationCheck.h @@ -6,21 +6,21 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H #include "../ClangTidyCheck.h" -namespace clang::tidy::cert { +namespace clang::tidy::bugprone { /// Modification of the std or posix namespace can result in undefined behavior. /// This check warns for such modifications. 
/// /// For the user-facing documentation see: -/// https://clang.llvm.org/extra/clang-tidy/checks/cert/dcl58-cpp.html -class DontModifyStdNamespaceCheck : public ClangTidyCheck { +/// https://clang.llvm.org/extra/clang-tidy/checks/bugprone/std-namespace-modification.html +class StdNamespaceModificationCheck : public ClangTidyCheck { public: - DontModifyStdNamespaceCheck(StringRef Name, ClangTidyContext *Context) + StdNamespaceModificationCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context) {} bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; @@ -29,6 +29,6 @@ class DontModifyStdNamespaceCheck : public ClangTidyCheck { void check(const ast_matchers::MatchFinder::MatchResult &Result) override; }; -} // namespace clang::tidy::cert +} // namespace clang::tidy::bugprone -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_CERT_DONT_MODIFY_STD_NAMESPACE_H +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_STDNAMESPACEMODIFICATIONCHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp index d5179770008e5..15592d5c03c06 100644 --- a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp @@ -19,6 +19,7 @@ #include "../bugprone/SignedCharMisuseCheck.h" #include "../bugprone/SizeofExpressionCheck.h" #include "../bugprone/SpuriouslyWakeUpFunctionsCheck.h" +#include "../bugprone/StdNamespaceModificationCheck.h" #include "../bugprone/SuspiciousMemoryComparisonCheck.h" #include "../bugprone/ThrowingStaticInitializationCheck.h" #include "../bugprone/UncheckedStringToNumberConversionCheck.h" @@ -36,7 +37,6 @@ #include "../performance/MoveConstructorInitCheck.h" #include "../readability/EnumInitialValueCheck.h" #include "../readability/UppercaseLiteralSuffixCheck.h" -#include "DontModifyStdNamespaceCheck.h" #include "FloatLoopCounter.h" #include "LimitedRandomnessCheck.h" #include "MutatingCopyCheck.h" @@ -251,7 +251,8 @@ class CERTModule : public ClangTidyModule { "cert-dcl51-cpp"); CheckFactories.registerCheck( "cert-dcl54-cpp"); - CheckFactories.registerCheck("cert-dcl58-cpp"); + CheckFactories.registerCheck( + "cert-dcl58-cpp"); CheckFactories.registerCheck( "cert-dcl59-cpp"); // ERR diff --git a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt index db3b2f5a08286..27cf392ce0bb1 100644 --- a/clang-tools-extra/clang-tidy/cert/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/cert/CMakeLists.txt @@ -5,7 +5,6 @@ set(LLVM_LINK_COMPONENTS add_clang_library(clangTidyCERTModule STATIC CERTTidyModule.cpp - DontModifyStdNamespaceCheck.cpp FloatLoopCounter.cpp LimitedRandomnessCheck.cpp MutatingCopyCheck.cpp diff --git a/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp b/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp index 16febeca70809..b557066d979f5 100644 --- a/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp +++ b/clang-tools-extra/clangd/refactor/tweaks/OverridePureVirtuals.cpp @@ -79,7 +79,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/DeclCXX.h" -#include "clang/AST/Type.h" +#include "clang/AST/TypeBase.h" #include "clang/AST/TypeLoc.h" #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" @@ -116,7 +116,8 @@ std::string removePureVirtualSyntax(const std::string &MethodDecl, DeclString += Tk.text(); if (Tk.Kind != tok::l_paren && Next.Kind != tok::comma && - Next.Kind != 
tok::r_paren && Next.Kind != tok::l_paren) + Next.Kind != tok::r_paren && Next.Kind != tok::l_paren && + Tk.Kind != tok::coloncolon && Next.Kind != tok::coloncolon) DeclString += ' '; } // Trim the last whitespace. diff --git a/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp b/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp index b7dcbee1650ec..72095ab2f5982 100644 --- a/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp +++ b/clang-tools-extra/clangd/unittests/tweaks/OverridePureVirtualsTests.cpp @@ -715,6 +715,45 @@ class D : public B { EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; } +TEST_F(OverridePureVirtualsTests, QualifiedNames) { + constexpr auto Before = R"cpp( +namespace foo { struct S{}; namespace bar { struct S2{}; } } + +class B { +public: + virtual foo::S foo(int var = 0) = 0; + virtual foo::bar::S2 bar(int var = 0) = 0; +}; + +class ^D : public B {}; +)cpp"; + + constexpr auto Expected = R"cpp( +namespace foo { struct S{}; namespace bar { struct S2{}; } } + +class B { +public: + virtual foo::S foo(int var = 0) = 0; + virtual foo::bar::S2 bar(int var = 0) = 0; +}; + +class D : public B { +public: + foo::S foo(int var = 0) override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `foo` is not implemented."); + } + + foo::bar::S2 bar(int var = 0) override { + // TODO: Implement this pure virtual method. + static_assert(false, "Method `bar` is not implemented."); + } +}; +)cpp"; + auto Applied = apply(Before); + EXPECT_EQ(Expected, Applied) << "Applied result:\n" << Applied; +} + } // namespace } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/docs/Maintainers.rst b/clang-tools-extra/docs/Maintainers.rst new file mode 100644 index 0000000000000..f78e9ecf279a6 --- /dev/null +++ b/clang-tools-extra/docs/Maintainers.rst @@ -0,0 +1 @@ +.. include:: ../Maintainers.rst \ No newline at end of file diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 35a64395f04e9..88eb0ebff4501 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -244,6 +244,11 @@ New check aliases ` keeping initial check as an alias to the new one. +- Renamed :doc:`cert-dcl58-cpp ` to + :doc:`bugprone-std-namespace-modification + ` + keeping initial check as an alias to the new one. + - Renamed :doc:`cert-env33-c ` to :doc:`bugprone-command-processor ` @@ -379,7 +384,8 @@ Changes in existing checks ` check to avoid false positives when pointers is transferred to non-const references and avoid false positives of function pointer and fix false - positives on return of non-const pointer. + positives on return of non-const pointer and fix false positives on + pointer-to-member operator. - Improved :doc:`misc-header-include-cycle ` check performance. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst new file mode 100644 index 0000000000000..c6e5608280264 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/std-namespace-modification.rst @@ -0,0 +1,63 @@ +.. title:: clang-tidy - bugprone-std-namespace-modification + +bugprone-std-namespace-modification +=================================== + +Warns on modifications of the ``std`` or ``posix`` namespaces which can +result in undefined behavior. 
+ +The ``std`` (or ``posix``) namespace is allowed to be extended with (class or +function) template specializations that depend on an user-defined type (a type +that is not defined in the standard system headers). + +The check detects the following (user provided) declarations in namespace ``std`` or ``posix``: + +- Anything that is not a template specialization. +- Explicit specializations of any standard library function template or class template, if it does not have any user-defined type as template argument. +- Explicit specializations of any member function of a standard library class template. +- Explicit specializations of any member function template of a standard library class or class template. +- Explicit or partial specialization of any member class template of a standard library class or class template. + +Examples: + +.. code-block:: c++ + + namespace std { + int x; // warning: modification of 'std' namespace can result in undefined behavior [bugprone-dont-modify-std-namespace] + } + + namespace posix::a { // warning: modification of 'posix' namespace can result in undefined behavior + } + + template <> + struct ::std::hash { // warning: modification of 'std' namespace can result in undefined behavior + unsigned long operator()(const long &K) const { + return K; + } + }; + + struct MyData { long data; }; + + template <> + struct ::std::hash { // no warning: specialization with user-defined type + unsigned long operator()(const MyData &K) const { + return K.data; + } + }; + + namespace std { + template <> + void swap(bool &a, bool &b); // warning: modification of 'std' namespace can result in undefined behavior + + template <> + bool less::operator()(MyData &&, MyData &&) const { // warning: modification of 'std' namespace can result in undefined behavior + return true; + } + } + +References +---------- + +This check corresponds to the CERT C++ Coding Standard rule +`DCL58-CPP. Do not modify the standard namespaces +`_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst index fbcc6281a8898..1b8c2c4f97dde 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cert/dcl58-cpp.rst @@ -3,57 +3,9 @@ cert-dcl58-cpp ============== -Modification of the ``std`` or ``posix`` namespace can result in undefined -behavior. -This check warns for such modifications. -The ``std`` (or ``posix``) namespace is allowed to be extended with (class or -function) template specializations that depend on an user-defined type (a type -that is not defined in the standard system headers). - -The check detects the following (user provided) declarations in namespace ``std`` or ``posix``: - -- Anything that is not a template specialization. -- Explicit specializations of any standard library function template or class template, if it does not have any user-defined type as template argument. -- Explicit specializations of any member function of a standard library class template. -- Explicit specializations of any member function template of a standard library class or class template. -- Explicit or partial specialization of any member class template of a standard library class or class template. - -Examples: - -.. 
code-block:: c++ - - namespace std { - int x; // warning: modification of 'std' namespace can result in undefined behavior [cert-dcl58-cpp] - } - - namespace posix::a { // warning: modification of 'posix' namespace can result in undefined behavior - } - - template <> - struct ::std::hash { // warning: modification of 'std' namespace can result in undefined behavior - unsigned long operator()(const long &K) const { - return K; - } - }; - - struct MyData { long data; }; - - template <> - struct ::std::hash { // no warning: specialization with user-defined type - unsigned long operator()(const MyData &K) const { - return K.data; - } - }; - - namespace std { - template <> - void swap(bool &a, bool &b); // warning: modification of 'std' namespace can result in undefined behavior - - template <> - bool less::operator()(MyData &&, MyData &&) const { // warning: modification of 'std' namespace can result in undefined behavior - return true; - } - } +The `cert-dcl58-cpp` is an aliaes, please see +`bugprone-std-namespace-modification <../bugprone/std-namespace-modification.html>`_ +for more information. This check corresponds to the CERT C++ Coding Standard rule `DCL58-CPP. Do not modify the standard namespaces diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index e14ac715cfeeb..5094bcc90caf3 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -141,6 +141,7 @@ Clang-Tidy Checks :doc:`bugprone-sizeof-expression `, :doc:`bugprone-spuriously-wake-up-functions `, :doc:`bugprone-standalone-empty `, "Yes" + :doc:`bugprone-std-namespace-modification `, :doc:`bugprone-string-constructor `, "Yes" :doc:`bugprone-string-integer-assignment `, "Yes" :doc:`bugprone-string-literal-with-embedded-nul `, @@ -175,7 +176,6 @@ Clang-Tidy Checks :doc:`bugprone-unused-return-value `, :doc:`bugprone-use-after-move `, :doc:`bugprone-virtual-near-miss `, "Yes" - :doc:`cert-dcl58-cpp `, :doc:`cert-err33-c `, :doc:`cert-err60-cpp `, :doc:`cert-flp30-c `, @@ -441,6 +441,7 @@ Check aliases :doc:`cert-dcl50-cpp `, :doc:`modernize-avoid-variadic-functions `, :doc:`cert-dcl51-cpp `, :doc:`bugprone-reserved-identifier `, "Yes" :doc:`cert-dcl54-cpp `, :doc:`misc-new-delete-overloads `, + :doc:`cert-dcl58-cpp `, :doc:`bugprone-std-namespace-modification `, :doc:`cert-dcl59-cpp `, :doc:`google-build-namespaces `, :doc:`cert-env33-c `, :doc:`bugprone-command-processor `, :doc:`cert-err09-cpp `, :doc:`misc-throw-by-value-catch-by-reference `, diff --git a/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst b/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst index ec9ef1c60913c..6c994a48d83de 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/llvm/twine-local.rst @@ -14,3 +14,21 @@ should be generally avoided. // becomes static std::string Moo = (Twine("bark") + "bah").str(); + +The ``Twine`` does not own the memory of its contents, so it is not +recommended to use ``Twine`` created from temporary strings or string literals. + +.. code-block:: c++ + + static Twine getModuleIdentifier(StringRef moduleName) { + return moduleName + "_module"; + } + void foo() { + Twine result = getModuleIdentifier(std::string{"abc"} + "def"); + // temporary std::string is destroyed here, result is dangling + } + +After applying this fix-it hints, the code will use ``std::string`` instead of +``Twine`` for local variables. 
However, ``Twine`` has lots of methods that +are incompatible with ``std::string``, so the user may need to adjust the code +manually after applying the fix-it hints. diff --git a/clang-tools-extra/docs/index.rst b/clang-tools-extra/docs/index.rst index 3f3a99d1b70c6..eba4a2cdbc558 100644 --- a/clang-tools-extra/docs/index.rst +++ b/clang-tools-extra/docs/index.rst @@ -22,6 +22,7 @@ Contents pp-trace clangd clang-doc + Maintainers Doxygen Documentation diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h index b6977cd9ce6c6..0870f60eaa39b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h +++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/system-header-simulation.h @@ -59,7 +59,7 @@ struct X {}; } // namespace std // Template specializations that are in a system-header file. -// The purpose is to test cert-dcl58-cpp (no warnings here). +// The purpose is to test bugprone-std-namespace-modification (no warnings here). namespace std { template <> void swap(short &, short &){}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp similarity index 97% rename from clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp rename to clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp index 01964e7dc6c76..32bcbcaa21c0d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cert/dcl58-cpp.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/std-namespace-modification.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++17-or-later %s cert-dcl58-cpp %t -- -- -I %clang_tidy_headers +// RUN: %check_clang_tidy -std=c++17-or-later %s bugprone-std-namespace-modification %t -- -- -I %clang_tidy_headers #include "system-header-simulation.h" @@ -15,7 +15,7 @@ namespace A { } namespace posix { -// CHECK-MESSAGES: :[[@LINE+2]]:11: warning: modification of 'posix' namespace can result in undefined behavior [cert-dcl58-cpp] +// CHECK-MESSAGES: :[[@LINE+2]]:11: warning: modification of 'posix' namespace can result in undefined behavior [bugprone-std-namespace-modification] // CHECK-MESSAGES: :[[@LINE-2]]:11: note: 'posix' namespace opened here namespace foo { int foobar; diff --git a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp index ea9896e1cfde2..fccbd9b3740bd 100644 --- a/clang-tools-extra/test/pp-trace/pp-trace-include.cpp +++ b/clang-tools-extra/test/pp-trace/pp-trace-include.cpp @@ -39,7 +39,6 @@ // CHECK-NEXT: Reason: EnterFile // CHECK-NEXT: FileType: C_User // CHECK-NEXT: PrevFID: (invalid) -// CHECK: - Callback: MacroDefined // CHECK: - Callback: FileChanged // CHECK-NEXT: Loc: ":1:1" // CHECK-NEXT: Reason: ExitFile diff --git a/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp b/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp index 7c2a231101070..5bd38e0dade28 100644 --- a/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp +++ b/clang-tools-extra/test/pp-trace/pp-trace-macro.cpp @@ -40,7 +40,6 @@ X // CHECK-NEXT: MacroNameTok: __STDC_EMBED_EMPTY__ // CHECK-NEXT: MacroDirective: MD_Define // CHECK: - Callback: MacroDefined -// CHECK: - Callback: MacroDefined // CHECK-NEXT: MacroNameTok: MACRO // CHECK-NEXT: MacroDirective: MD_Define // CHECK-NEXT: - Callback: MacroExpands diff --git 
a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index baa0bbb5ea631..b7c18c2b5c6cf 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -2409,6 +2409,16 @@ those modes. Use ``__has_feature(c_fixed_enum)`` to determine whether support for fixed underlying types is available in C23 and later. +Enumerations with no enumerators +-------------------------------- + +Clang provides support for Microsoft extensions to support enumerations with no enumerators. + +.. code-block:: c++ + + typedef enum empty { } A; + + Interoperability with C++11 lambdas ----------------------------------- diff --git a/clang/docs/LibClang.rst b/clang/docs/LibClang.rst index e747022b9c173..6c62bcb5f8c29 100644 --- a/clang/docs/LibClang.rst +++ b/clang/docs/LibClang.rst @@ -38,6 +38,7 @@ Code example .. code-block:: cpp + // main.cpp #include #include @@ -57,6 +58,22 @@ Code example CXCursor cursor = clang_getTranslationUnitCursor(unit); //Obtain a cursor at the root of the translation unit } +.. code-block:: cmake + + # CMakeLists.txt + cmake_minimum_required(VERSION 3.20) + project(my_clang_tool VERSION 0.1.0) + + # This will find the default system installation of Clang; if you want to + # use a different build of clang, pass -DClang_DIR=/foobar/lib/cmake/clang + # to the CMake configure command, where /foobar is the build directory where + # you built Clang. + find_package(Clang CONFIG REQUIRED) + + add_executable(my_clang_tool main.cpp) + target_include_directories(my_clang_tool PRIVATE ${CLANG_INCLUDE_DIRS}) + target_link_libraries(my_clang_tool PRIVATE libclang) + Visiting elements of an AST ~~~~~~~~~~~~~~~~~~~~~~~~~~~ The elements of an AST can be recursively visited with pre-order traversal with ``clang_visitChildren``. @@ -283,6 +300,7 @@ Complete example code .. code-block:: cpp + // main.cpp #include #include @@ -356,6 +374,21 @@ Complete example code ); } +.. code-block:: cmake + + # CMakeLists.txt + cmake_minimum_required(VERSION 3.20) + project(my_clang_tool VERSION 0.1.0) + + # This will find the default system installation of Clang; if you want to + # use a different build of clang, pass -DClang_DIR=/foobar/lib/cmake/clang + # to the CMake configure command, where /foobar is the build directory where + # you built Clang. + find_package(Clang CONFIG REQUIRED) + + add_executable(my_clang_tool main.cpp) + target_include_directories(my_clang_tool PRIVATE ${CLANG_INCLUDE_DIRS}) + target_link_libraries(my_clang_tool PRIVATE libclang) .. _Index.h: https://github.com/llvm/llvm-project/blob/main/clang/include/clang-c/Index.h diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 6959e61cac980..32f669f8d70d8 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -209,6 +209,8 @@ C23 Feature Support `WG14 N2710 `_. - Fixed accepting as compatible unnamed tag types with the same fields within the same translation unit but from different types. +- ``-MG`` now silences the "file not found" errors with ``#embed`` when + scanning for dependencies and encountering an unknown file. #GH165632 Non-comprehensive list of changes in this release ------------------------------------------------- @@ -355,7 +357,7 @@ Improvements to Clang's diagnostics potential misaligned members get processed before they can get discarded. (#GH144729) -- Clang now emits dignostic with correct message in case of assigning to const reference captured in lambda. 
(#GH105647) +- Clang now emits a diagnostic with the correct message in case of assigning to const reference captured in lambda. (#GH105647) - Fixed false positive in ``-Wmissing-noreturn`` diagnostic when it was requiring the usage of ``[[noreturn]]`` on lambdas before C++23 (#GH154493). @@ -395,6 +397,11 @@ Improvements to Clang's diagnostics that were previously incorrectly accepted in case of other irrelevant conditions are now consistently diagnosed, identical to C++ mode. +- Fix false-positive unused label diagnostic when a label is used in a named break + or continue (#GH166013) +- Clang now emits a diagnostic in case `vector_size` or `ext_vector_type` + attributes are used with a negative size (#GH165463). + Improvements to Clang's time-trace ---------------------------------- @@ -437,6 +444,7 @@ Bug Fixes in This Version - Fixed a failed assertion with empty filename arguments in ``__has_embed``. (#GH159898) - Fixed a failed assertion with empty filename in ``#embed`` directive. (#GH162951) - Fixed a crash triggered by unterminated ``__has_embed``. (#GH162953) +- Accept empty enumerations in MSVC-compatible C mode. (#GH114402) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -457,6 +465,7 @@ Bug Fixes to Attribute Support - Fix a crash when the function name is empty in the `swift_name` attribute. (#GH157075) - Fixes crashes or missing diagnostics with the `device_kernel` attribute. (#GH161905) - Fix handling of parameter indexes when an attribute is applied to a C++23 explicit object member function. +- Fixed several false positives and false negatives in function effect (`nonblocking`) analysis. (#GH166078) (#GH166101) (#GH166110) Bug Fixes to C++ Support ^^^^^^^^^^^^^^^^^^^^^^^^ @@ -471,7 +480,7 @@ Bug Fixes to C++ Support casts that are guaranteed to fail (#GH137518). - Fix bug rejecting partial specialization of variable templates with auto NTTPs (#GH118190). - Fix a crash if errors "member of anonymous [...] redeclares" and - "intializing multiple members of union" coincide (#GH149985). + "initializing multiple members of union" coincide (#GH149985). - Fix a crash when using ``explicit(bool)`` in pre-C++11 language modes. (#GH152729) - Fix the parsing of variadic member functions when the ellipis immediately follows a default argument.(#GH153445) - Fixed a bug that caused ``this`` captured by value in a lambda with a dependent explicit object parameter to not be @@ -501,6 +510,7 @@ Bug Fixes to C++ Support nontrivial member when another member has an initializer. (#GH81774) - Fixed a template depth issue when parsing lambdas inside a type constraint. (#GH162092) - Diagnose unresolved overload sets in non-dependent compound requirements. (#GH51246) (#GH97753) +- Fix a crash when extracting unavailable member type from alias in template deduction. 
(#GH165560) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/APNumericStorage.h b/clang/include/clang/AST/APNumericStorage.h index e1948a552bf7e..04424086b98cf 100644 --- a/clang/include/clang/AST/APNumericStorage.h +++ b/clang/include/clang/AST/APNumericStorage.h @@ -41,9 +41,8 @@ class APNumericStorage { llvm::APInt getIntValue() const { unsigned NumWords = llvm::APInt::getNumWords(BitWidth); if (NumWords > 1) - return llvm::APInt(BitWidth, NumWords, pVal); - else - return llvm::APInt(BitWidth, VAL); + return llvm::APInt(BitWidth, llvm::ArrayRef(pVal, NumWords)); + return llvm::APInt(BitWidth, VAL); } void setIntValue(const ASTContext &C, const llvm::APInt &Val); }; diff --git a/clang/include/clang/AST/AbstractBasicReader.h b/clang/include/clang/AST/AbstractBasicReader.h index 0d187eb49d6ca..064a342aa0684 100644 --- a/clang/include/clang/AST/AbstractBasicReader.h +++ b/clang/include/clang/AST/AbstractBasicReader.h @@ -173,7 +173,7 @@ class DataStreamBasicReader : public BasicReaderBase { llvm::SmallVector data; for (uint32_t i = 0; i != numWords; ++i) data.push_back(asImpl().readUInt64()); - return llvm::APInt(bitWidth, numWords, &data[0]); + return llvm::APInt(bitWidth, data); } llvm::FixedPointSemantics readFixedPointSemantics() { diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h index 91ffbb169f947..eb532bc8be3a7 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h @@ -23,7 +23,11 @@ #include "clang/Analysis/Analyses/LifetimeSafety/Facts.h" #include "clang/Analysis/Analyses/LifetimeSafety/LiveOrigins.h" #include "clang/Analysis/Analyses/LifetimeSafety/LoanPropagation.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" #include "clang/Analysis/AnalysisDeclContext.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/raw_ostream.h" +#include namespace clang::lifetimes { @@ -44,10 +48,6 @@ class LifetimeSafetyReporter { Confidence Confidence) {} }; -/// The main entry point for the analysis. -void runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, - LifetimeSafetyReporter *Reporter); - namespace internal { /// An object to hold the factories for immutable collections, ensuring /// that all created states share the same underlying memory management. @@ -60,6 +60,7 @@ struct LifetimeFactory { /// Running the lifetime safety analysis and querying its results. It /// encapsulates the various dataflow analyses. class LifetimeSafetyAnalysis { + public: LifetimeSafetyAnalysis(AnalysisDeclContext &AC, LifetimeSafetyReporter *Reporter); @@ -82,6 +83,12 @@ class LifetimeSafetyAnalysis { std::unique_ptr LoanPropagation; }; } // namespace internal + +/// The main entry point for the analysis. 
+std::unique_ptr +runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, + LifetimeSafetyReporter *Reporter); + } // namespace clang::lifetimes #endif // LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMESAFETY_H diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h index ba138b078b379..26686a63e9204 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety/Origins.h @@ -16,7 +16,10 @@ #include "clang/AST/Decl.h" #include "clang/AST/Expr.h" +#include "clang/AST/TypeBase.h" #include "clang/Analysis/Analyses/LifetimeSafety/Utils.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Support/raw_ostream.h" namespace clang::lifetimes::internal { @@ -76,6 +79,8 @@ class OriginManager { void dump(OriginID OID, llvm::raw_ostream &OS) const; + const llvm::StringMap getMissingOrigins() const; + private: OriginID getNextOriginID() { return NextOriginID++; } @@ -85,6 +90,7 @@ class OriginManager { llvm::SmallVector AllOrigins; llvm::DenseMap DeclToOriginID; llvm::DenseMap ExprToOriginID; + llvm::StringMap ExprTypeToMissingOriginCount; }; } // namespace clang::lifetimes::internal diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index 2b400b012d6ed..0275447e1090a 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -5235,6 +5235,12 @@ def HLSLGetSpirvSpecConstant : LangBuiltin<"HLSL_LANG">, HLSLScalarTemplate { let Prototype = "T(unsigned int, T)"; } +def HLSLF16ToF32 : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_elementwise_f16tof32"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + // Builtins for XRay. 
def XRayCustomEvent : Builtin { let Spellings = ["__xray_customevent"]; diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 9e877b92eac68..4388c09423a21 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -1765,75 +1765,48 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in { def scattersiv8si : X86Builtin<"void(void *, unsigned char, _Vector<8, int>, _Vector<8, int>, _Constant int)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpermi2vard128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; -} - -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpermi2varpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">; -} - -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">; -} - -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">; -} - -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { def vpermi2varq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>, _Vector<2, long long int>)">; + def vpermi2varps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>, _Vector<4, float>)">; + def vpermi2varpd128 : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>, _Vector<2, double>)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def vpermi2vard256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; def vpermi2varq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>)">; + def vpermi2varps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>, _Vector<8, float>)">; + def vpermi2varpd256 : 
X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>, _Vector<4, double>)">; } -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { + def vpermi2vard512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; def vpermi2varq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Vector<8, long long int>)">; + def vpermi2varps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>, _Vector<16, float>)">; + def vpermi2varpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, long long int>, _Vector<8, double>)">; } -let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpermi2varqi128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">; } -let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vbmi,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def vpermi2varqi256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">; } -let Features = "avx512vbmi", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512vbmi", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def vpermi2varqi512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Vector<64, char>)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def vpermi2varhi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Vector<8, short>)">; } -let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl,avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def vpermi2varhi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Vector<16, short>)">; } -let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def vpermi2varhi512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>, _Vector<32, short>)">; } diff --git a/clang/include/clang/Basic/DebugOptions.def b/clang/include/clang/Basic/DebugOptions.def index a768b12fa4e0d..ea3636ffa1af1 100644 --- a/clang/include/clang/Basic/DebugOptions.def +++ b/clang/include/clang/Basic/DebugOptions.def @@ -46,6 +46,8 @@ ENUM_DEBUGOPT(EmitDwarfUnwind, EmitDwarfUnwindType, 2, DEBUGOPT(NoDwarfDirectoryAsm , 1, 0, Benign) ///< Set when -fno-dwarf-directory-asm ///< is enabled. +DEBUGOPT(Dwarf2CFIAsm, 1, 0, NotCompatible) ///< Set when -fdwarf2-cfi-asm is enabled. + DEBUGOPT(NoInlineLineTables, 1, 0, Benign) ///< Whether debug info should contain ///< inline line tables. 
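As a side note on the ``BuiltinsX86.td`` change above: marking the ``vpermi2var*`` builtins ``Constexpr`` lets two-source permutes participate in constant evaluation. A minimal sketch of what that enables (assuming AVX-512VL is enabled for the translation unit and that vector subscripting in constant expressions behaves as in Clang's existing constexpr builtin tests):

.. code-block:: c++

  typedef int v4si __attribute__((vector_size(16)));

  // For each destination lane, index bits [1:0] select the element and bit 2
  // selects the source vector (0 = a, 1 = b), matching the evaluator added
  // for __builtin_ia32_vpermi2vard128 in this change.
  constexpr v4si a   = {10, 11, 12, 13};
  constexpr v4si idx = { 0,  4,  1,  5};
  constexpr v4si b   = {20, 21, 22, 23};
  constexpr v4si r   = __builtin_ia32_vpermi2vard128(a, idx, b);
  static_assert(r[0] == 10 && r[1] == 20 && r[2] == 11 && r[3] == 21, "");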
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 8aa3489a2a62b..1e0321de3f4b6 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1436,6 +1436,7 @@ def MicrosoftDrectveSection : DiagGroup<"microsoft-drectve-section">; def MicrosoftInclude : DiagGroup<"microsoft-include">; def MicrosoftCppMacro : DiagGroup<"microsoft-cpp-macro">; def MicrosoftFixedEnum : DiagGroup<"microsoft-fixed-enum">; +def MicrosoftEmptyEnum : DiagGroup<"microsoft-empty-enum">; def MicrosoftSealed : DiagGroup<"microsoft-sealed">; def MicrosoftAbstract : DiagGroup<"microsoft-abstract">; def MicrosoftUnqualifiedFriend : DiagGroup<"microsoft-unqualified-friend">; @@ -1489,7 +1490,8 @@ def Microsoft : DiagGroup<"microsoft", MicrosoftConstInit, MicrosoftVoidPseudoDtor, MicrosoftAnonTag, MicrosoftCommentPaste, MicrosoftEndOfFile, MicrosoftInitFromPredefined, MicrosoftStringLiteralFromPredefined, - MicrosoftInconsistentDllImport, MicrosoftInlineOnNonFunction]>; + MicrosoftInconsistentDllImport, MicrosoftInlineOnNonFunction, + MicrosoftEmptyEnum]>; def ClangClPch : DiagGroup<"clang-cl-pch">; diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index e5e071f43fa75..aa0ccb0c05101 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -116,6 +116,9 @@ def err_enumerator_unnamed_no_def : Error< def ext_ms_c_enum_fixed_underlying_type : Extension< "enumeration types with a fixed underlying type are a Microsoft extension">, InGroup; +def ext_ms_c_empty_enum_type : Extension< + "empty enumeration types are a Microsoft extension">, + InGroup; def ext_c23_enum_fixed_underlying_type : Extension< "enumeration types with a fixed underlying type are a C23 extension">, InGroup; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 4e369be0bbb92..fa509536bf021 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3510,6 +3510,8 @@ def err_init_method_bad_return_type : Error< "init methods must return an object pointer type, not %0">; def err_attribute_invalid_size : Error< "vector size not an integral multiple of component size">; +def err_attribute_vec_negative_size + : Error<"vector must have non-negative size">; def err_attribute_zero_size : Error<"zero %0 size">; def err_attribute_size_too_large : Error<"%0 size too large">; def err_typecheck_sve_rvv_ambiguous : Error< diff --git a/clang/include/clang/Basic/SourceManager.h b/clang/include/clang/Basic/SourceManager.h index ed967fd47dc83..6d9d074d78026 100644 --- a/clang/include/clang/Basic/SourceManager.h +++ b/clang/include/clang/Basic/SourceManager.h @@ -1286,16 +1286,7 @@ class SourceManager : public RefCountedBase { /// If the location is an expansion record, walk through it until we find /// the final location expanded. FileIDAndOffset getDecomposedExpansionLoc(SourceLocation Loc) const { - FileID FID = getFileID(Loc); - auto *E = getSLocEntryOrNull(FID); - if (!E) - return std::make_pair(FileID(), 0); - - unsigned Offset = Loc.getOffset()-E->getOffset(); - if (Loc.isFileID()) - return std::make_pair(FID, Offset); - - return getDecomposedExpansionLocSlowCase(E); + return getDecomposedLoc(getExpansionLoc(Loc)); } /// Decompose the specified location into a raw FileID + Offset pair. 
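Relatedly, the ``err_attribute_vec_negative_size`` diagnostic added to ``DiagnosticSemaKinds.td`` above gives negative vector sizes a dedicated error. A small sketch of the kind of user code it now rejects (hypothetical snippet; the message text is taken from the diagnostic definition):

.. code-block:: c++

  // error: vector must have non-negative size
  typedef int bad_vec __attribute__((vector_size(-16)));

  // error: vector must have non-negative size
  typedef float bad_ext_vec __attribute__((ext_vector_type(-4)));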
@@ -1303,15 +1294,7 @@ class SourceManager : public RefCountedBase { /// If the location is an expansion record, walk through it until we find /// its spelling record. FileIDAndOffset getDecomposedSpellingLoc(SourceLocation Loc) const { - FileID FID = getFileID(Loc); - auto *E = getSLocEntryOrNull(FID); - if (!E) - return std::make_pair(FileID(), 0); - - unsigned Offset = Loc.getOffset()-E->getOffset(); - if (Loc.isFileID()) - return std::make_pair(FID, Offset); - return getDecomposedSpellingLocSlowCase(E, Offset); + return getDecomposedLoc(getSpellingLoc(Loc)); } /// Returns the "included/expanded in" decomposed location of the given @@ -1979,10 +1962,6 @@ class SourceManager : public RefCountedBase { SourceLocation getSpellingLocSlowCase(SourceLocation Loc) const; SourceLocation getFileLocSlowCase(SourceLocation Loc) const; - FileIDAndOffset - getDecomposedExpansionLocSlowCase(const SrcMgr::SLocEntry *E) const; - FileIDAndOffset getDecomposedSpellingLocSlowCase(const SrcMgr::SLocEntry *E, - unsigned Offset) const; void computeMacroArgsCache(MacroArgsMap &MacroArgsCache, FileID FID) const; void associateFileChunkWithMacroArgExp(MacroArgsMap &MacroArgsCache, FileID FID, diff --git a/clang/include/clang/Basic/riscv_sifive_vector.td b/clang/include/clang/Basic/riscv_sifive_vector.td index 89e644a078682..0371279aafc08 100644 --- a/clang/include/clang/Basic/riscv_sifive_vector.td +++ b/clang/include/clang/Basic/riscv_sifive_vector.td @@ -121,6 +121,13 @@ multiclass RVVVQMACCQOQBuiltinSet> suffixes_prototypes> { defm NAME : RVVOutOp1Op2BuiltinSet; } +multiclass RVVVFEXPBuiltinSet> suffixes_prototypes, string type_range> { + let UnMaskedPolicyScheme = HasPassthruOperand, + OverloadedName = NAME, + Log2LMUL = [-2, -1, 0, 1, 2, 3] in + defm NAME : RVVOutBuiltinSet; +} + multiclass RVVVFNRCLIPBuiltinSet { let Log2LMUL = [-3, -2, -1, 0, 1, 2], Name = NAME, @@ -145,6 +152,26 @@ let UnMaskedPolicyScheme = HasPolicyOperand in defm sf_vqmaccsu_4x8x4 : RVVVQMACCQOQBuiltinSet<[["", "w", "ww(FixedSEW:8)Sv(FixedSEW:8)Uv"]]>; } +let RequiredFeatures = ["xsfvfbfexp16e"] in { + defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "y">; +} + +let RequiredFeatures = ["xsfvfexp16e"] in { + defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "x">; +} + +let RequiredFeatures = ["xsfvfexp32e"] in { + defm sf_vfexp : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "f">; +} + +let RequiredFeatures = ["xsfvfexpa"] in { + defm sf_vfexpa : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "xf">; +} + +let RequiredFeatures = ["xsfvfexpa64e"] in { + defm sf_vfexpa : RVVVFEXPBuiltinSet<[["v", "v", "vv"]], "d">; +} + let UnMaskedPolicyScheme = HasPolicyOperand in let RequiredFeatures = ["xsfvfwmaccqqq"] in defm sf_vfwmacc_4x4x4 : RVVVFWMACCBuiltinSet<[["", "Fw", "FwFwSvv"]]>; diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 2b361ed0982c6..6f9a69e697cc3 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -4171,6 +4171,16 @@ def CIR_ATanOp : CIR_UnaryFPToFPBuiltinOp<"atan", "ATanOp"> { }]; } +def CIR_CeilOp : CIR_UnaryFPToFPBuiltinOp<"ceil", "FCeilOp"> { + let summary = "Computes the ceiling of the specified value"; + let description = [{ + `cir.ceil` computes the ceiling of a given value and returns a result + of the same type. + + Floating-point exceptions are ignored, and it does not set `errno`. 
+ }]; +} + def CIR_CosOp : CIR_UnaryFPToFPBuiltinOp<"cos", "CosOp"> { let summary = "Computes the floating-point cosine value"; let description = [{ @@ -4181,6 +4191,16 @@ def CIR_CosOp : CIR_UnaryFPToFPBuiltinOp<"cos", "CosOp"> { }]; } +def CIR_ExpOp : CIR_UnaryFPToFPBuiltinOp<"exp", "ExpOp"> { + let summary = "Computes the floating-point base-e exponential value"; + let description = [{ + `cir.exp` computes the exponential of a floating-point operand and returns + a result of the same type. + + Floating-point exceptions are ignored, and it does not set `errno`. + }]; +} + def CIR_FAbsOp : CIR_UnaryFPToFPBuiltinOp<"fabs", "FAbsOp"> { let summary = "Computes the floating-point absolute value"; let description = [{ diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 48ef8be9fb782..6f099a7027a10 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -180,6 +180,8 @@ struct MissingFeatures { static bool atomicSyncScopeID() { return false; } static bool atomicTypes() { return false; } static bool atomicUseLibCall() { return false; } + static bool atomicMicrosoftVolatile() { return false; } + static bool atomicOpenMP() { return false; } // Global ctor handling static bool globalCtorLexOrder() { return false; } diff --git a/clang/include/clang/CodeGen/ModuleBuilder.h b/clang/include/clang/CodeGen/ModuleBuilder.h index f1b8229edd362..4298ba06c472e 100644 --- a/clang/include/clang/CodeGen/ModuleBuilder.h +++ b/clang/include/clang/CodeGen/ModuleBuilder.h @@ -120,6 +120,23 @@ CodeGenerator *CreateLLVMCodeGen(DiagnosticsEngine &Diags, llvm::LLVMContext &C, CoverageSourceInfo *CoverageInfo = nullptr); +namespace CodeGen { +/// Demangle the artificial function name (\param FuncName) used to encode trap +/// reasons used in debug info for traps (e.g. __builtin_verbose_trap). See +/// `CGDebugInfo::CreateTrapFailureMessageFor`. +/// +/// \param FuncName - The function name to demangle. +/// +/// \return A std::optional. If demangling succeeds the optional will contain +/// a pair of StringRefs where the first field is the trap category and the +/// second is the trap message. These can both be empty. If demangling fails the +/// optional will not contain a value. Note the returned StringRefs if non-empty +/// point into the underlying storage for \param FuncName and thus have the same +/// lifetime. +std::optional> +DemangleTrapReasonInDebugInfo(StringRef FuncName); +} // namespace CodeGen + } // end namespace clang #endif diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 20955ef1b852e..11e81e032d5fc 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -741,6 +741,7 @@ def gcc_toolchain : Joined<["--"], "gcc-toolchain=">, Flags<[NoXarchOption]>, "Specify a directory where Flang can find 'lib{,32,64}/gcc{,-cross}/$triple/$version'. 
" "Flang will use the GCC installation with the largest version">; def gcc_triple_EQ : Joined<["--"], "gcc-triple=">, + Visibility<[ClangOption, FlangOption]>, HelpText<"Search for the GCC installation with the specified triple.">; def CC : Flag<["-"], "CC">, Visibility<[ClangOption, CC1Option]>, Group, @@ -950,9 +951,9 @@ def Xarch__ the host system, which can be used to suppress incompatible GPU arguments.}]>, MetaVarName<" ">; def Xarch_host : Separate<["-"], "Xarch_host">, Flags<[NoXarchOption]>, - HelpText<"Pass to the CUDA/HIP host compilation">, MetaVarName<"">; + HelpText<"Pass to host compilation in the offloading toolchain">, MetaVarName<"">; def Xarch_device : Separate<["-"], "Xarch_device">, Flags<[NoXarchOption]>, - HelpText<"Pass to the CUDA/HIP device compilation">, MetaVarName<"">; + HelpText<"Pass to device compilation in the offloading toolchain">, MetaVarName<"">; def Xassembler : Separate<["-"], "Xassembler">, HelpText<"Pass to the assembler">, MetaVarName<"">, Group; @@ -2154,8 +2155,12 @@ defm dollars_in_identifiers : BoolFOption<"dollars-in-identifiers", PosFlag, NegFlag, BothFlags<[], [ClangOption, CC1Option], " '$' in identifiers">>; -def fdwarf2_cfi_asm : Flag<["-"], "fdwarf2-cfi-asm">, Group; -def fno_dwarf2_cfi_asm : Flag<["-"], "fno-dwarf2-cfi-asm">, Group; + +defm dwarf2_cfi_asm + : BoolFOption<"dwarf2-cfi-asm", CodeGenOpts<"Dwarf2CFIAsm">, DefaultFalse, + PosFlag, + NegFlag>; + defm dwarf_directory_asm : BoolFOption<"dwarf-directory-asm", CodeGenOpts<"NoDwarfDirectoryAsm">, DefaultFalse, NegFlag, diff --git a/clang/include/clang/Sema/AnalysisBasedWarnings.h b/clang/include/clang/Sema/AnalysisBasedWarnings.h index 4103c3f006a8f..604039ef61cb7 100644 --- a/clang/include/clang/Sema/AnalysisBasedWarnings.h +++ b/clang/include/clang/Sema/AnalysisBasedWarnings.h @@ -14,7 +14,10 @@ #define LLVM_CLANG_SEMA_ANALYSISBASEDWARNINGS_H #include "clang/AST/Decl.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Facts.h" +#include "clang/Analysis/AnalysisDeclContext.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringMap.h" #include namespace clang { @@ -95,6 +98,9 @@ class AnalysisBasedWarnings { /// a single function. unsigned MaxUninitAnalysisBlockVisitsPerFunction; + /// Map from expressions missing origin in OriginManager to their counts. 
+ llvm::StringMap MissingOriginCount; + /// @} public: @@ -116,6 +122,9 @@ class AnalysisBasedWarnings { Policy &getPolicyOverrides() { return PolicyOverrides; } void PrintStats() const; + + void FindMissingOrigins(AnalysisDeclContext &AC, + clang::lifetimes::internal::FactManager &FactMgr); }; } // namespace sema diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp index fd0903f2e652c..638028f84ff24 100644 --- a/clang/lib/AST/ByteCode/Disasm.cpp +++ b/clang/lib/AST/ByteCode/Disasm.cpp @@ -436,8 +436,28 @@ LLVM_DUMP_METHOD void Descriptor::dumpFull(unsigned Offset, FO += ElemDesc->getAllocSize(); } + } else if (isPrimitiveArray()) { + OS.indent(Spaces) << "Elements: " << getNumElems() << '\n'; + OS.indent(Spaces) << "Element type: " << primTypeToString(getPrimType()) + << '\n'; + unsigned FO = Offset + sizeof(InitMapPtr); + for (unsigned I = 0; I != getNumElems(); ++I) { + OS.indent(Spaces) << "Element " << I << " offset: " << FO << '\n'; + FO += getElemSize(); + } } else if (isRecord()) { ElemRecord->dump(OS, Indent + 1, Offset); + unsigned I = 0; + for (const Record::Field &F : ElemRecord->fields()) { + OS.indent(Spaces) << "- Field " << I << ": "; + { + ColorScope SC(OS, true, {llvm::raw_ostream::BRIGHT_RED, true}); + OS << F.Decl->getName(); + } + OS << ". Offset " << (Offset + F.Offset) << "\n"; + F.Desc->dumpFull(Offset + F.Offset, Indent + 1); + ++I; + } } else if (isPrimitive()) { } else { } diff --git a/clang/lib/AST/ByteCode/Floating.h b/clang/lib/AST/ByteCode/Floating.h index 659892e720abf..cc918dc12deb6 100644 --- a/clang/lib/AST/ByteCode/Floating.h +++ b/clang/lib/AST/ByteCode/Floating.h @@ -45,7 +45,8 @@ class Floating final { if (singleWord()) return APFloat(getSemantics(), APInt(BitWidth, Val)); unsigned NumWords = numWords(); - return APFloat(getSemantics(), APInt(BitWidth, NumWords, Memory)); + return APFloat(getSemantics(), + APInt(BitWidth, llvm::ArrayRef(Memory, NumWords))); } public: diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h index 6683db941c736..b11e6eea28e3f 100644 --- a/clang/lib/AST/ByteCode/IntegralAP.h +++ b/clang/lib/AST/ByteCode/IntegralAP.h @@ -63,7 +63,7 @@ template class IntegralAP final { if (singleWord()) return APInt(BitWidth, Val, Signed); unsigned NumWords = llvm::APInt::getNumWords(BitWidth); - return llvm::APInt(BitWidth, NumWords, Memory); + return llvm::APInt(BitWidth, llvm::ArrayRef(Memory, NumWords)); } public: diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 8b57b963c538f..9991e365addb8 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -3415,18 +3415,46 @@ static bool interp__builtin_ia32_shuffle_generic( GetSourceIndex) { assert(Call->getNumArgs() == 3); - unsigned ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue(); + + unsigned ShuffleMask = 0; + Pointer A, MaskVector, B; + + QualType Arg2Type = Call->getArg(2)->getType(); + bool IsVectorMask = false; + if (Arg2Type->isVectorType()) { + IsVectorMask = true; + B = S.Stk.pop(); + MaskVector = S.Stk.pop(); + A = S.Stk.pop(); + } else if (Arg2Type->isIntegerType()) { + ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue(); + B = S.Stk.pop(); + A = S.Stk.pop(); + } else { + return false; + } QualType Arg0Type = Call->getArg(0)->getType(); const auto *VecT = Arg0Type->castAs(); PrimType ElemT = *S.getContext().classify(VecT->getElementType()); unsigned NumElems = VecT->getNumElements(); - const Pointer 
&B = S.Stk.pop(); - const Pointer &A = S.Stk.pop(); const Pointer &Dst = S.Stk.peek(); + PrimType MaskElemT = PT_Uint32; + if (IsVectorMask) { + QualType Arg1Type = Call->getArg(1)->getType(); + const auto *MaskVecT = Arg1Type->castAs(); + QualType MaskElemType = MaskVecT->getElementType(); + MaskElemT = *S.getContext().classify(MaskElemType); + } + for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) { + if (IsVectorMask) { + INT_TYPE_SWITCH(MaskElemT, { + ShuffleMask = static_cast(MaskVector.elem(DstIdx)); + }); + } auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask); if (SrcIdx < 0) { @@ -4434,6 +4462,60 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return std::pair{0, static_cast(DstIdx)}; } }); + case X86::BI__builtin_ia32_vpermi2varq128: + case X86::BI__builtin_ia32_vpermi2varpd128: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x1; + unsigned SrcIdx = (ShuffleMask >> 1) & 0x1; + return std::pair{SrcIdx, Offset}; + }); + case X86::BI__builtin_ia32_vpermi2vard128: + case X86::BI__builtin_ia32_vpermi2varps128: + case X86::BI__builtin_ia32_vpermi2varq256: + case X86::BI__builtin_ia32_vpermi2varpd256: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x3; + unsigned SrcIdx = (ShuffleMask >> 2) & 0x1; + return std::pair{SrcIdx, Offset}; + }); + case X86::BI__builtin_ia32_vpermi2varhi128: + case X86::BI__builtin_ia32_vpermi2vard256: + case X86::BI__builtin_ia32_vpermi2varps256: + case X86::BI__builtin_ia32_vpermi2varq512: + case X86::BI__builtin_ia32_vpermi2varpd512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x7; + unsigned SrcIdx = (ShuffleMask >> 3) & 0x1; + return std::pair{SrcIdx, Offset}; + }); + case X86::BI__builtin_ia32_vpermi2varqi128: + case X86::BI__builtin_ia32_vpermi2varhi256: + case X86::BI__builtin_ia32_vpermi2vard512: + case X86::BI__builtin_ia32_vpermi2varps512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0xF; + unsigned SrcIdx = (ShuffleMask >> 4) & 0x1; + return std::pair{SrcIdx, Offset}; + }); + case X86::BI__builtin_ia32_vpermi2varqi256: + case X86::BI__builtin_ia32_vpermi2varhi512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x1F; + unsigned SrcIdx = (ShuffleMask >> 5) & 0x1; + return std::pair{SrcIdx, Offset}; + }); + case X86::BI__builtin_ia32_vpermi2varqi512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x3F; + unsigned SrcIdx = (ShuffleMask >> 6) & 0x1; + return std::pair{SrcIdx, Offset}; + }); case X86::BI__builtin_ia32_pshufb128: case X86::BI__builtin_ia32_pshufb256: case X86::BI__builtin_ia32_pshufb512: diff --git a/clang/lib/AST/ByteCode/Program.cpp b/clang/lib/AST/ByteCode/Program.cpp index e0b2852f0e906..2425373ab2ef8 100644 --- a/clang/lib/AST/ByteCode/Program.cpp +++ b/clang/lib/AST/ByteCode/Program.cpp @@ -218,21 +218,42 @@ UnsignedOrNone Program::createGlobal(const ValueDecl *VD, const Expr *Init) { return std::nullopt; Global *NewGlobal = Globals[*Idx]; + // Note that this loop has one iteration where Redecl == VD. 
for (const Decl *Redecl : VD->redecls()) { - unsigned &PIdx = GlobalIndices[Redecl]; + + // If this redecl was registered as a dummy variable, it is now a proper + // global variable and points to the block we just created. + if (auto DummyIt = DummyVariables.find(Redecl); + DummyIt != DummyVariables.end()) { + assert(!Globals[DummyIt->second]->block()->hasPointers()); + Globals[DummyIt->second] = NewGlobal; + DummyVariables.erase(DummyIt); + } + // If the redeclaration hasn't been registered yet at all, we just set its + // global index to Idx. If it has been registered yet, it might have + // pointers pointing to it and we need to transfer those pointers to the new + // block. + auto [Iter, Inserted] = GlobalIndices.try_emplace(Redecl); + if (Inserted) { + GlobalIndices[Redecl] = *Idx; + continue; + } + if (Redecl != VD) { - if (Block *RedeclBlock = Globals[PIdx]->block(); + if (Block *RedeclBlock = Globals[Iter->second]->block(); RedeclBlock->isExtern()) { - Globals[PIdx] = NewGlobal; + // All pointers pointing to the previous extern decl now point to the // new decl. // A previous iteration might've already fixed up the pointers for this // global. if (RedeclBlock != NewGlobal->block()) RedeclBlock->movePointersTo(NewGlobal->block()); + + Globals[Iter->second] = NewGlobal; } } - PIdx = *Idx; + Iter->second = *Idx; } return *Idx; diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h index 28fcc97f5339d..cc9127dc77860 100644 --- a/clang/lib/AST/ByteCode/Program.h +++ b/clang/lib/AST/ByteCode/Program.h @@ -205,7 +205,6 @@ class Program final { const Block *block() const { return &B; } private: - /// Required metadata - does not actually track pointers. Block B; }; diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 97eeba8b9d6cc..8fab6efafb983 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11628,21 +11628,38 @@ static bool evalShuffleGeneric( if (!VT) return false; - APSInt MaskImm; - if (!EvaluateInteger(Call->getArg(2), MaskImm, Info)) - return false; - unsigned ShuffleMask = static_cast(MaskImm.getZExtValue()); + unsigned ShuffleMask = 0; + APValue A, MaskVector, B; + bool IsVectorMask = false; - APValue A, B; - if (!EvaluateAsRValue(Info, Call->getArg(0), A) || - !EvaluateAsRValue(Info, Call->getArg(1), B)) + QualType Arg2Type = Call->getArg(2)->getType(); + if (Arg2Type->isVectorType()) { + IsVectorMask = true; + if (!EvaluateAsRValue(Info, Call->getArg(0), A) || + !EvaluateAsRValue(Info, Call->getArg(1), MaskVector) || + !EvaluateAsRValue(Info, Call->getArg(2), B)) + return false; + } else if (Arg2Type->isIntegerType()) { + APSInt MaskImm; + if (!EvaluateInteger(Call->getArg(2), MaskImm, Info)) + return false; + ShuffleMask = static_cast(MaskImm.getZExtValue()); + if (!EvaluateAsRValue(Info, Call->getArg(0), A) || + !EvaluateAsRValue(Info, Call->getArg(1), B)) + return false; + } else { return false; + } unsigned NumElts = VT->getNumElements(); - SmallVector ResultElements; + SmallVector ResultElements; ResultElements.reserve(NumElts); for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) { + if (IsVectorMask) { + ShuffleMask = static_cast( + MaskVector.getVectorElt(DstIdx).getInt().getZExtValue()); + } auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask); if (SrcIdx < 0) { @@ -13080,6 +13097,84 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case X86::BI__builtin_ia32_vpermi2varq128: 
+ case X86::BI__builtin_ia32_vpermi2varpd128: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, + [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x1; + unsigned SrcIdx = (ShuffleMask >> 1) & 0x1; + return std::pair{SrcIdx, Offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_vpermi2vard128: + case X86::BI__builtin_ia32_vpermi2varps128: + case X86::BI__builtin_ia32_vpermi2varq256: + case X86::BI__builtin_ia32_vpermi2varpd256: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, + [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x3; + unsigned SrcIdx = (ShuffleMask >> 2) & 0x1; + return std::pair{SrcIdx, Offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_vpermi2varhi128: + case X86::BI__builtin_ia32_vpermi2vard256: + case X86::BI__builtin_ia32_vpermi2varps256: + case X86::BI__builtin_ia32_vpermi2varq512: + case X86::BI__builtin_ia32_vpermi2varpd512: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, + [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x7; + unsigned SrcIdx = (ShuffleMask >> 3) & 0x1; + return std::pair{SrcIdx, Offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_vpermi2varqi128: + case X86::BI__builtin_ia32_vpermi2varhi256: + case X86::BI__builtin_ia32_vpermi2vard512: + case X86::BI__builtin_ia32_vpermi2varps512: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, + [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0xF; + unsigned SrcIdx = (ShuffleMask >> 4) & 0x1; + return std::pair{SrcIdx, Offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_vpermi2varqi256: + case X86::BI__builtin_ia32_vpermi2varhi512: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, + [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x1F; + unsigned SrcIdx = (ShuffleMask >> 5) & 0x1; + return std::pair{SrcIdx, Offset}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_vpermi2varqi512: { + APValue R; + if (!evalShuffleGeneric(Info, E, R, + [](unsigned DstIdx, unsigned ShuffleMask) { + int Offset = ShuffleMask & 0x3F; + unsigned SrcIdx = (ShuffleMask >> 6) & 0x1; + return std::pair{SrcIdx, Offset}; + })) + return false; + return Success(R, E); + } } } diff --git a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp index 75b17c545bb78..54c30c05c3e19 100644 --- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp +++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp @@ -746,11 +746,14 @@ ExprMutationAnalyzer::Analyzer::findPointeeMemberMutation(const Expr *Exp) { Stm, Context)); if (MemberCallExpr) return MemberCallExpr; - const auto Matches = - match(stmt(forEachDescendant( - memberExpr(hasObjectExpression(canResolveToExprPointee(Exp))) - .bind(NodeID::value))), - Stm, Context); + const auto Matches = match( + stmt(forEachDescendant( + expr(anyOf(memberExpr( + hasObjectExpression(canResolveToExprPointee(Exp))), + binaryOperator(hasOperatorName("->*"), + hasLHS(canResolveToExprPointee(Exp))))) + .bind(NodeID::value))), + Stm, Context); return findExprMutation(Matches); } diff --git a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp index 00c7ed90503e7..d183ce976f946 100644 --- a/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety/LifetimeSafety.cpp @@ -23,9 +23,11 @@ #include 
"clang/Analysis/AnalysisDeclContext.h" #include "clang/Analysis/CFG.h" #include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/StringMap.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TimeProfiler.h" +#include "llvm/Support/raw_ostream.h" #include namespace clang::lifetimes { @@ -69,9 +71,12 @@ void LifetimeSafetyAnalysis::run() { } } // namespace internal -void runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, - LifetimeSafetyReporter *Reporter) { - internal::LifetimeSafetyAnalysis Analysis(AC, Reporter); - Analysis.run(); +std::unique_ptr +runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, + LifetimeSafetyReporter *Reporter) { + std::unique_ptr Analysis = + std::make_unique(AC, Reporter); + Analysis->run(); + return Analysis; } } // namespace clang::lifetimes diff --git a/clang/lib/Analysis/LifetimeSafety/Origins.cpp b/clang/lib/Analysis/LifetimeSafety/Origins.cpp index ea51a75324e06..b8f705e3377c2 100644 --- a/clang/lib/Analysis/LifetimeSafety/Origins.cpp +++ b/clang/lib/Analysis/LifetimeSafety/Origins.cpp @@ -7,6 +7,9 @@ //===----------------------------------------------------------------------===// #include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" +#include "clang/AST/Expr.h" +#include "clang/AST/TypeBase.h" +#include "llvm/ADT/StringMap.h" namespace clang::lifetimes::internal { @@ -22,6 +25,10 @@ void OriginManager::dump(OriginID OID, llvm::raw_ostream &OS) const { OS << ")"; } +const llvm::StringMap OriginManager::getMissingOrigins() const { + return ExprTypeToMissingOriginCount; +} + Origin &OriginManager::addOrigin(OriginID ID, const clang::ValueDecl &D) { AllOrigins.emplace_back(ID, &D); return AllOrigins.back(); @@ -37,6 +44,18 @@ OriginID OriginManager::get(const Expr &E) { auto It = ExprToOriginID.find(&E); if (It != ExprToOriginID.end()) return It->second; + + // if the expression has no specific origin, increment the missing origin + // counter. + // const QualType ExprType = E.getType(); + std::string ExprStr(E.getStmtClassName()); + ExprStr = ExprStr + "<" + E.getType().getAsString() + ">"; + auto CountIt = ExprTypeToMissingOriginCount.find(ExprStr); + if (CountIt == ExprTypeToMissingOriginCount.end()) { + ExprTypeToMissingOriginCount[ExprStr] = 1; + } else { + CountIt->second++; + } // If the expression itself has no specific origin, and it's a reference // to a declaration, its origin is that of the declaration it refers to. 
// For pointer types, where we don't pre-emptively create an origin for the diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 938c6485125ee..7dc81c50f87a2 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -907,58 +907,27 @@ getExpansionLocSlowCase(SourceLocation Loc) const { SourceLocation SourceManager::getSpellingLocSlowCase(SourceLocation Loc) const { do { - FileIDAndOffset LocInfo = getDecomposedLoc(Loc); - Loc = getSLocEntry(LocInfo.first).getExpansion().getSpellingLoc(); - Loc = Loc.getLocWithOffset(LocInfo.second); + const SLocEntry &Entry = getSLocEntry(getFileID(Loc)); + Loc = Entry.getExpansion().getSpellingLoc().getLocWithOffset( + Loc.getOffset() - Entry.getOffset()); } while (!Loc.isFileID()); return Loc; } SourceLocation SourceManager::getFileLocSlowCase(SourceLocation Loc) const { do { - if (isMacroArgExpansion(Loc)) - Loc = getImmediateSpellingLoc(Loc); - else - Loc = getImmediateExpansionRange(Loc).getBegin(); + const SLocEntry &Entry = getSLocEntry(getFileID(Loc)); + const ExpansionInfo &ExpInfo = Entry.getExpansion(); + if (ExpInfo.isMacroArgExpansion()) { + Loc = ExpInfo.getSpellingLoc().getLocWithOffset(Loc.getOffset() - + Entry.getOffset()); + } else { + Loc = ExpInfo.getExpansionLocStart(); + } } while (!Loc.isFileID()); return Loc; } -FileIDAndOffset SourceManager::getDecomposedExpansionLocSlowCase( - const SrcMgr::SLocEntry *E) const { - // If this is an expansion record, walk through all the expansion points. - FileID FID; - SourceLocation Loc; - unsigned Offset; - do { - Loc = E->getExpansion().getExpansionLocStart(); - - FID = getFileID(Loc); - E = &getSLocEntry(FID); - Offset = Loc.getOffset()-E->getOffset(); - } while (!Loc.isFileID()); - - return std::make_pair(FID, Offset); -} - -FileIDAndOffset -SourceManager::getDecomposedSpellingLocSlowCase(const SrcMgr::SLocEntry *E, - unsigned Offset) const { - // If this is an expansion record, walk through all the expansion points. - FileID FID; - SourceLocation Loc; - do { - Loc = E->getExpansion().getSpellingLoc(); - Loc = Loc.getLocWithOffset(Offset); - - FID = getFileID(Loc); - E = &getSLocEntry(FID); - Offset = Loc.getOffset()-E->getOffset(); - } while (!Loc.isFileID()); - - return std::make_pair(FID, Offset); -} - /// getImmediateSpellingLoc - Given a SourceLocation object, return the /// spelling location referenced by the ID. 
This is the first level down
/// towards the place where the characters that make up the lexed token can be
diff --git a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
index 7db6e283ec0a5..cd4c1f0e5b769 100644
--- a/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenAtomic.cpp
@@ -27,6 +27,7 @@ class AtomicInfo {
   CharUnits atomicAlign;
   CharUnits valueAlign;
   TypeEvaluationKind evaluationKind = cir::TEK_Scalar;
+  bool useLibCall = true;
   LValue lvalue;
   mlir::Location loc;
 
@@ -62,8 +63,8 @@ class AtomicInfo {
       assert(!cir::MissingFeatures::atomicInfo());
       cgf.cgm.errorNYI(loc, "AtomicInfo: non-simple lvalue");
     }
-
-    assert(!cir::MissingFeatures::atomicUseLibCall());
+    useLibCall = !ctx.getTargetInfo().hasBuiltinAtomic(
+        atomicSizeInBits, ctx.toBits(lvalue.getAlignment()));
   }
 
   QualType getValueType() const { return valueTy; }
@@ -75,6 +76,8 @@ class AtomicInfo {
     assert(!cir::MissingFeatures::atomicInfoGetAtomicPointer());
     return nullptr;
   }
+  bool shouldUseLibCall() const { return useLibCall; }
+  const LValue &getAtomicLValue() const { return lvalue; }
   Address getAtomicAddress() const {
     mlir::Type elemTy;
     if (lvalue.isSimple()) {
@@ -96,6 +99,8 @@ class AtomicInfo {
 
   bool emitMemSetZeroIfNecessary() const;
 
+  mlir::Value getScalarRValValueOrNull(RValue rvalue) const;
+
   /// Cast the given pointer to an integer pointer suitable for atomic
   /// operations on the source.
   Address castToAtomicIntPointer(Address addr) const;
@@ -105,6 +110,9 @@ class AtomicInfo {
   /// copy the value across.
   Address convertToAtomicIntPointer(Address addr) const;
 
+  /// Converts an rvalue to an integer value.
+  mlir::Value convertRValueToInt(RValue rvalue, bool cmpxchg = false) const;
+
   /// Copy an atomic r-value into atomic-layout memory.
   void emitCopyIntoMemory(RValue rvalue) const;
 
@@ -195,6 +203,12 @@ Address AtomicInfo::createTempAlloca() const {
   return tempAlloca;
 }
 
+mlir::Value AtomicInfo::getScalarRValValueOrNull(RValue rvalue) const {
+  if (rvalue.isScalar() && (!hasPadding() || !lvalue.isSimple()))
+    return rvalue.getValue();
+  return nullptr;
+}
+
 Address AtomicInfo::castToAtomicIntPointer(Address addr) const {
   auto intTy = mlir::dyn_cast(addr.getElementType());
   // Don't bother with int casts if the integer size is the same.
@@ -211,10 +225,38 @@ bool AtomicInfo::emitMemSetZeroIfNecessary() const {
     return false;
 
   cgf.cgm.errorNYI(loc,
-                   "AtomicInfo::emitMemSetZeroIfNecaessary: emit memset zero");
+                   "AtomicInfo::emitMemSetZeroIfNecessary: emit memset zero");
   return false;
 }
 
+/// Return true if \param valueTy is a type that should be casted to integer
+/// around the atomic memory operation. If \param cmpxchg is true, then the
+/// cast of a floating point type is made, as that instruction cannot have
+/// floating point operands. TODO: Allow compare-and-exchange and FP - see
+/// comment in CIRGenAtomicExpandPass.cpp.
+static bool shouldCastToInt(mlir::Type valueTy, bool cmpxchg) {
+  if (cir::isAnyFloatingPointType(valueTy))
+    return isa(valueTy) || cmpxchg;
+  return !isa(valueTy) && !isa(valueTy);
+}
+
+mlir::Value AtomicInfo::convertRValueToInt(RValue rvalue, bool cmpxchg) const {
+  // If we've got a scalar value of the right size, try to avoid going
+  // through memory. Floats get casted if needed by AtomicExpandPass.
+ if (mlir::Value value = getScalarRValValueOrNull(rvalue)) { + if (!shouldCastToInt(value.getType(), cmpxchg)) + return cgf.emitToMemory(value, valueTy); + + cgf.cgm.errorNYI( + loc, "AtomicInfo::convertRValueToInt: cast scalar rvalue to int"); + return nullptr; + } + + cgf.cgm.errorNYI( + loc, "AtomicInfo::convertRValueToInt: cast non-scalar rvalue to int"); + return nullptr; +} + /// Copy an r-value into memory as part of storing to an atomic type. /// This needs to create a bit-pattern suitable for atomic operations. void AtomicInfo::emitCopyIntoMemory(RValue rvalue) const { @@ -815,6 +857,79 @@ RValue CIRGenFunction::emitAtomicExpr(AtomicExpr *e) { e->getExprLoc()); } +void CIRGenFunction::emitAtomicStore(RValue rvalue, LValue dest, bool isInit) { + bool isVolatile = dest.isVolatileQualified(); + auto order = cir::MemOrder::SequentiallyConsistent; + if (!dest.getType()->isAtomicType()) { + assert(!cir::MissingFeatures::atomicMicrosoftVolatile()); + } + return emitAtomicStore(rvalue, dest, order, isVolatile, isInit); +} + +/// Emit a store to an l-value of atomic type. +/// +/// Note that the r-value is expected to be an r-value of the atomic type; this +/// means that for aggregate r-values, it should include storage for any padding +/// that was necessary. +void CIRGenFunction::emitAtomicStore(RValue rvalue, LValue dest, + cir::MemOrder order, bool isVolatile, + bool isInit) { + // If this is an aggregate r-value, it should agree in type except + // maybe for address-space qualification. + mlir::Location loc = dest.getPointer().getLoc(); + assert(!rvalue.isAggregate() || + rvalue.getAggregateAddress().getElementType() == + dest.getAddress().getElementType()); + + AtomicInfo atomics(*this, dest, loc); + LValue lvalue = atomics.getAtomicLValue(); + + if (lvalue.isSimple()) { + // If this is an initialization, just put the value there normally. + if (isInit) { + atomics.emitCopyIntoMemory(rvalue); + return; + } + + // Check whether we should use a library call. + if (atomics.shouldUseLibCall()) { + assert(!cir::MissingFeatures::atomicUseLibCall()); + cgm.errorNYI(loc, "emitAtomicStore: atomic store with library call"); + return; + } + + // Okay, we're doing this natively. + mlir::Value valueToStore = atomics.convertRValueToInt(rvalue); + + // Do the atomic store. + Address addr = atomics.getAtomicAddress(); + if (mlir::Value value = atomics.getScalarRValValueOrNull(rvalue)) { + if (shouldCastToInt(value.getType(), /*CmpXchg=*/false)) { + addr = atomics.castToAtomicIntPointer(addr); + valueToStore = + builder.createIntCast(valueToStore, addr.getElementType()); + } + } + cir::StoreOp store = builder.createStore(loc, valueToStore, addr); + + // Initializations don't need to be atomic. + if (!isInit) { + assert(!cir::MissingFeatures::atomicOpenMP()); + store.setMemOrder(order); + } + + // Other decoration. 
+ if (isVolatile) + store.setIsVolatile(true); + + assert(!cir::MissingFeatures::opLoadStoreTbaa()); + return; + } + + cgm.errorNYI(loc, "emitAtomicStore: non-simple atomic lvalue"); + assert(!cir::MissingFeatures::opLoadStoreAtomic()); +} + void CIRGenFunction::emitAtomicInit(Expr *init, LValue dest) { AtomicInfo atomics(*this, dest, getLoc(init->getSourceRange())); diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index e35100ffe4b6b..0803910f2e83a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -211,6 +211,28 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, assert(!cir::MissingFeatures::fastMathFlags()); return emitUnaryMaybeConstrainedFPBuiltin(*this, *e); + case Builtin::BIceil: + case Builtin::BIceilf: + case Builtin::BIceill: + case Builtin::BI__builtin_ceil: + case Builtin::BI__builtin_ceilf: + case Builtin::BI__builtin_ceilf16: + case Builtin::BI__builtin_ceill: + case Builtin::BI__builtin_ceilf128: + assert(!cir::MissingFeatures::fastMathFlags()); + return emitUnaryMaybeConstrainedFPBuiltin(*this, *e); + + case Builtin::BIexp: + case Builtin::BIexpf: + case Builtin::BIexpl: + case Builtin::BI__builtin_exp: + case Builtin::BI__builtin_expf: + case Builtin::BI__builtin_expf16: + case Builtin::BI__builtin_expl: + case Builtin::BI__builtin_expf128: + assert(!cir::MissingFeatures::fastMathFlags()); + return emitUnaryMaybeConstrainedFPBuiltin(*this, *e); + case Builtin::BIfabs: case Builtin::BIfabsf: case Builtin::BIfabsl: diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp index 851328a7db680..437db306f3369 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp @@ -147,8 +147,8 @@ void *EHScopeStack::pushCleanup(CleanupKind kind, size_t size) { assert(!cir::MissingFeatures::innermostEHScope()); - EHCleanupScope *scope = new (buffer) - EHCleanupScope(size, branchFixups.size(), innermostNormalCleanup); + EHCleanupScope *scope = new (buffer) EHCleanupScope( + size, branchFixups.size(), innermostNormalCleanup, innermostEHScope); if (isNormalCleanup) innermostNormalCleanup = stable_begin(); @@ -191,7 +191,9 @@ void EHScopeStack::popCleanup() { EHCatchScope *EHScopeStack::pushCatch(unsigned numHandlers) { char *buffer = allocate(EHCatchScope::getSizeForNumHandlers(numHandlers)); assert(!cir::MissingFeatures::innermostEHScope()); - EHCatchScope *scope = new (buffer) EHCatchScope(numHandlers); + EHCatchScope *scope = + new (buffer) EHCatchScope(numHandlers, innermostEHScope); + innermostEHScope = stable_begin(); return scope; } diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.h b/clang/lib/CIR/CodeGen/CIRGenCleanup.h index 61a09a59b05c0..a035d792ef6d1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.h +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.h @@ -30,6 +30,8 @@ struct CatchTypeInfo { /// A protected scope for zero-cost EH handling. 
class EHScope { + EHScopeStack::stable_iterator enclosingEHScope; + class CommonBitFields { friend class EHScope; unsigned kind : 3; @@ -79,7 +81,10 @@ class EHScope { public: enum Kind { Cleanup, Catch, Terminate, Filter }; - EHScope(Kind kind) { commonBits.kind = kind; } + EHScope(Kind kind, EHScopeStack::stable_iterator enclosingEHScope) + : enclosingEHScope(enclosingEHScope) { + commonBits.kind = kind; + } Kind getKind() const { return static_cast(commonBits.kind); } @@ -90,6 +95,10 @@ class EHScope { assert(!cir::MissingFeatures::ehstackBranches()); return false; } + + EHScopeStack::stable_iterator getEnclosingEHScope() const { + return enclosingEHScope; + } }; /// A scope which attempts to handle some, possibly all, types of @@ -111,6 +120,8 @@ class EHCatchScope : public EHScope { /// The catch handler for this type. mlir::Region *region; + + bool isCatchAll() const { return type.rtti == nullptr; } }; private: @@ -118,12 +129,18 @@ class EHCatchScope : public EHScope { Handler *getHandlers() { return reinterpret_cast(this + 1); } + const Handler *getHandlers() const { + return reinterpret_cast(this + 1); + } + public: static size_t getSizeForNumHandlers(unsigned n) { return sizeof(EHCatchScope) + n * sizeof(Handler); } - EHCatchScope(unsigned numHandlers) : EHScope(Catch) { + EHCatchScope(unsigned numHandlers, + EHScopeStack::stable_iterator enclosingEHScope) + : EHScope(Catch, enclosingEHScope) { catchBits.numHandlers = numHandlers; assert(catchBits.numHandlers == numHandlers && "NumHandlers overflow?"); } @@ -136,6 +153,11 @@ class EHCatchScope : public EHScope { getHandlers()[i].region = region; } + const Handler &getHandler(unsigned i) const { + assert(i < getNumHandlers()); + return getHandlers()[i]; + } + // Clear all handler blocks. // FIXME: it's better to always call clearHandlerBlocks in DTOR and have a // 'takeHandler' or some such function which removes ownership from the @@ -144,6 +166,10 @@ class EHCatchScope : public EHScope { // The blocks are owned by TryOp, nothing to delete. } + using iterator = const Handler *; + iterator begin() const { return getHandlers(); } + iterator end() const { return getHandlers() + getNumHandlers(); } + static bool classof(const EHScope *scope) { return scope->getKind() == Catch; } @@ -176,9 +202,10 @@ class alignas(EHScopeStack::ScopeStackAlignment) EHCleanupScope } EHCleanupScope(unsigned cleanupSize, unsigned fixupDepth, - EHScopeStack::stable_iterator enclosingNormal) - : EHScope(EHScope::Cleanup), enclosingNormal(enclosingNormal), - fixupDepth(fixupDepth) { + EHScopeStack::stable_iterator enclosingNormal, + EHScopeStack::stable_iterator enclosingEH) + : EHScope(EHScope::Cleanup, enclosingEH), + enclosingNormal(enclosingNormal), fixupDepth(fixupDepth) { // TODO(cir): When exception handling is upstreamed, isNormalCleanup and // isEHCleanup will be arguments to the constructor. 
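The catch scope keeps its handlers directly behind the scope object itself, which is why the allocation size is computed with `getSizeForNumHandlers` and the accessor reinterpret-casts `this + 1`. A small self-contained sketch of that "header followed by a trailing array" layout, with simplified types and made-up names, assuming only the alignment `malloc` already guarantees:

```cpp
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <new>

struct Handler { int typeId; };

struct CatchScope {
  unsigned numHandlers;
  explicit CatchScope(unsigned n) : numHandlers(n) {}

  static std::size_t sizeForNumHandlers(unsigned n) {
    return sizeof(CatchScope) + n * sizeof(Handler);
  }
  // The handler array lives immediately behind the scope object.
  Handler *handlers() { return reinterpret_cast<Handler *>(this + 1); }
};

int main() {
  unsigned n = 3;
  void *buffer = std::malloc(CatchScope::sizeForNumHandlers(n));
  auto *scope = new (buffer) CatchScope(n);
  for (unsigned i = 0; i < n; ++i)
    new (scope->handlers() + i) Handler{static_cast<int>(i + 100)};

  for (unsigned i = 0; i < scope->numHandlers; ++i)
    std::cout << scope->handlers()[i].typeId << ' ';
  std::cout << '\n';  // 100 101 102

  std::free(buffer);  // Handler and CatchScope are trivially destructible
}
```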
cleanupBits.isNormalCleanup = true; @@ -235,13 +262,45 @@ class EHScopeStack::iterator { EHScope *get() const { return reinterpret_cast(ptr); } + EHScope *operator->() const { return get(); } EHScope &operator*() const { return *get(); } + + iterator &operator++() { + size_t size; + switch (get()->getKind()) { + case EHScope::Catch: + size = EHCatchScope::getSizeForNumHandlers( + static_cast(get())->getNumHandlers()); + break; + + case EHScope::Filter: + llvm_unreachable("EHScopeStack::iterator Filter"); + break; + + case EHScope::Cleanup: + llvm_unreachable("EHScopeStack::iterator Cleanup"); + break; + + case EHScope::Terminate: + llvm_unreachable("EHScopeStack::iterator Terminate"); + break; + } + ptr += llvm::alignTo(size, ScopeStackAlignment); + return *this; + } + + bool operator==(iterator other) const { return ptr == other.ptr; } + bool operator!=(iterator other) const { return ptr != other.ptr; } }; inline EHScopeStack::iterator EHScopeStack::begin() const { return iterator(startOfData); } +inline EHScopeStack::iterator EHScopeStack::end() const { + return iterator(endOfBuffer); +} + inline EHScopeStack::iterator EHScopeStack::find(stable_iterator savePoint) const { assert(savePoint.isValid() && "finding invalid savepoint"); @@ -254,7 +313,7 @@ inline void EHScopeStack::popCatch() { assert(!empty() && "popping exception stack when not empty"); EHCatchScope &scope = llvm::cast(*begin()); - assert(!cir::MissingFeatures::innermostEHScope()); + innermostEHScope = scope.getEnclosingEHScope(); deallocate(EHCatchScope::getSizeForNumHandlers(scope.getNumHandlers())); } diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 5ccb431e626ae..422fa1cf5ad2e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -311,7 +311,8 @@ static LValue emitGlobalVarDeclLValue(CIRGenFunction &cgf, const Expr *e, void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile, QualType ty, - bool isInit, bool isNontemporal) { + LValueBaseInfo baseInfo, bool isInit, + bool isNontemporal) { assert(!cir::MissingFeatures::opLoadStoreThreadLocal()); if (const auto *clangVecTy = ty->getAs()) { @@ -333,7 +334,13 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, Address addr, value = emitToMemory(value, ty); - assert(!cir::MissingFeatures::opLoadStoreAtomic()); + assert(!cir::MissingFeatures::opLoadStoreTbaa()); + LValue atomicLValue = LValue::makeAddr(addr, ty, baseInfo); + if (ty->isAtomicType() || + (!isInit && isLValueSuitableForInlineAtomic(atomicLValue))) { + emitAtomicStore(RValue::get(value), atomicLValue, isInit); + return; + } // Update the alloca with more info on initialization. 
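The `innermostEHScope` bookkeeping above works like an intrusive singly linked chain threaded through the scope stack: each pushed scope records the previously innermost scope, and `popCatch` restores it. A toy model of that invariant, using indices into a `std::vector` where the real code uses stable iterators into a byte buffer (class and member names here are invented for the sketch):

```cpp
#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

class ScopeStack {
public:
  static constexpr std::size_t npos = static_cast<std::size_t>(-1);

  std::size_t push() {
    scopes.push_back({innermostEHScope});  // remember the previous innermost
    innermostEHScope = scopes.size() - 1;
    return innermostEHScope;
  }

  void pop() {
    assert(!scopes.empty());
    innermostEHScope = scopes.back().enclosingEHScope;  // restore it
    scopes.pop_back();
  }

  std::size_t innermost() const { return innermostEHScope; }

private:
  struct Scope { std::size_t enclosingEHScope; };
  std::vector<Scope> scopes;
  std::size_t innermostEHScope = npos;
};

int main() {
  ScopeStack stack;
  stack.push();                                          // scope 0
  stack.push();                                          // scope 1
  std::cout << stack.innermost() << '\n';                // 1
  stack.pop();
  std::cout << stack.innermost() << '\n';                // 0
  stack.pop();
  std::cout << (stack.innermost() == ScopeStack::npos) << '\n';  // 1 (empty)
}
```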
assert(addr.getPointer() && "expected pointer to exist"); @@ -550,7 +557,8 @@ void CIRGenFunction::emitStoreOfScalar(mlir::Value value, LValue lvalue, } emitStoreOfScalar(value, lvalue.getAddress(), lvalue.isVolatile(), - lvalue.getType(), isInit, /*isNontemporal=*/false); + lvalue.getType(), lvalue.getBaseInfo(), isInit, + /*isNontemporal=*/false); } mlir::Value CIRGenFunction::emitLoadOfScalar(Address addr, bool isVolatile, @@ -1630,7 +1638,7 @@ RValue CIRGenFunction::emitAnyExpr(const Expr *e, AggValueSlot aggSlot, bool ignoreResult) { switch (CIRGenFunction::getEvaluationKind(e->getType())) { case cir::TEK_Scalar: - return RValue::get(emitScalarExpr(e)); + return RValue::get(emitScalarExpr(e, ignoreResult)); case cir::TEK_Complex: return RValue::getComplex(emitComplexExpr(e)); case cir::TEK_Aggregate: { diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index 3d3030ca87e2a..201fb73983155 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -343,8 +343,8 @@ class AggExprEmitter : public StmtVisitor { cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitNoInitExpr"); } void VisitCXXDefaultArgExpr(CXXDefaultArgExpr *dae) { - cgf.cgm.errorNYI(dae->getSourceRange(), - "AggExprEmitter: VisitCXXDefaultArgExpr"); + CIRGenFunction::CXXDefaultArgExprScope scope(cgf, dae); + Visit(dae->getExpr()); } void VisitCXXInheritedCtorInitExpr(const CXXInheritedCtorInitExpr *e) { cgf.cgm.errorNYI(e->getSourceRange(), diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 119314fe27dce..5eba5ba6c3df1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -78,11 +78,15 @@ struct BinOpInfo { class ScalarExprEmitter : public StmtVisitor { CIRGenFunction &cgf; CIRGenBuilderTy &builder; + // Unlike classic codegen we set this to false or use std::exchange to read + // the value instead of calling TestAndClearIgnoreResultAssign to make it + // explicit when the value is used bool ignoreResultAssign; public: - ScalarExprEmitter(CIRGenFunction &cgf, CIRGenBuilderTy &builder) - : cgf(cgf), builder(builder) {} + ScalarExprEmitter(CIRGenFunction &cgf, CIRGenBuilderTy &builder, + bool ignoreResultAssign = false) + : cgf(cgf), builder(builder), ignoreResultAssign(ignoreResultAssign) {} //===--------------------------------------------------------------------===// // Utilities @@ -221,6 +225,8 @@ class ScalarExprEmitter : public StmtVisitor { } mlir::Value VisitArraySubscriptExpr(ArraySubscriptExpr *e) { + ignoreResultAssign = false; + if (e->getBase()->getType()->isVectorType()) { assert(!cir::MissingFeatures::scalableVectors()); @@ -839,6 +845,7 @@ class ScalarExprEmitter : public StmtVisitor { BinOpInfo emitBinOps(const BinaryOperator *e, QualType promotionType = QualType()) { + ignoreResultAssign = false; BinOpInfo result; result.lhs = cgf.emitPromotedScalarExpr(e->getLHS(), promotionType); result.rhs = cgf.emitPromotedScalarExpr(e->getRHS(), promotionType); @@ -924,6 +931,7 @@ class ScalarExprEmitter : public StmtVisitor { #undef HANDLEBINOP mlir::Value emitCmp(const BinaryOperator *e) { + ignoreResultAssign = false; const mlir::Location loc = cgf.getLoc(e->getExprLoc()); mlir::Value result; QualType lhsTy = e->getLHS()->getType(); @@ -1406,11 +1414,13 @@ CIRGenFunction::emitCompoundAssignmentLValue(const CompoundAssignOperator *e) { } /// Emit the computation of the specified expression of 
scalar type. -mlir::Value CIRGenFunction::emitScalarExpr(const Expr *e) { +mlir::Value CIRGenFunction::emitScalarExpr(const Expr *e, + bool ignoreResultAssign) { assert(e && hasScalarEvaluationKind(e->getType()) && "Invalid scalar expression to emit"); - return ScalarExprEmitter(*this, builder).Visit(const_cast(e)); + return ScalarExprEmitter(*this, builder, ignoreResultAssign) + .Visit(const_cast(e)); } mlir::Value CIRGenFunction::emitPromotedScalarExpr(const Expr *e, @@ -2054,6 +2064,11 @@ mlir::Value ScalarExprEmitter::VisitMemberExpr(MemberExpr *e) { mlir::Value ScalarExprEmitter::VisitInitListExpr(InitListExpr *e) { const unsigned numInitElements = e->getNumInits(); + [[maybe_unused]] const bool ignore = std::exchange(ignoreResultAssign, false); + assert((ignore == false || + (numInitElements == 0 && e->getType()->isVoidType())) && + "init list ignored"); + if (e->hadArrayRangeDesignator()) { cgf.cgm.errorNYI(e->getSourceRange(), "ArrayRangeDesignator"); return {}; diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index e5cecaa573a6e..1c52a78d72e33 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1271,6 +1271,9 @@ class CIRGenFunction : public CIRGenTypeCache { RValue emitAtomicExpr(AtomicExpr *e); void emitAtomicInit(Expr *init, LValue dest); + void emitAtomicStore(RValue rvalue, LValue dest, bool isInit); + void emitAtomicStore(RValue rvalue, LValue dest, cir::MemOrder order, + bool isVolatile, bool isInit); AutoVarEmission emitAutoVarAlloca(const clang::VarDecl &d, mlir::OpBuilder::InsertPoint ip = {}); @@ -1501,7 +1504,8 @@ class CIRGenFunction : public CIRGenTypeCache { llvm::ArrayRef args = {}); /// Emit the computation of the specified expression of scalar type. - mlir::Value emitScalarExpr(const clang::Expr *e); + mlir::Value emitScalarExpr(const clang::Expr *e, + bool ignoreResultAssign = false); mlir::Value emitScalarPrePostIncDec(const UnaryOperator *e, LValue lv, cir::UnaryOpKind kind, bool isPre); @@ -1679,8 +1683,8 @@ class CIRGenFunction : public CIRGenTypeCache { bool isInit); void emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile, - clang::QualType ty, bool isInit = false, - bool isNontemporal = false); + clang::QualType ty, LValueBaseInfo baseInfo, + bool isInit = false, bool isNontemporal = false); void emitStoreOfScalar(mlir::Value value, LValue lvalue, bool isInit); /// Store the specified rvalue into the specified diff --git a/clang/lib/CIR/CodeGen/EHScopeStack.h b/clang/lib/CIR/CodeGen/EHScopeStack.h index 4198c23c9cbed..9005b0106b2a4 100644 --- a/clang/lib/CIR/CodeGen/EHScopeStack.h +++ b/clang/lib/CIR/CodeGen/EHScopeStack.h @@ -155,6 +155,9 @@ class EHScopeStack { /// The innermost normal cleanup on the stack. stable_iterator innermostNormalCleanup = stable_end(); + /// The innermost EH scope on the stack. + stable_iterator innermostEHScope = stable_end(); + /// The CGF this Stack belong to CIRGenFunction *cgf = nullptr; @@ -226,6 +229,8 @@ class EHScopeStack { } stable_iterator getInnermostActiveNormalCleanup() const; + stable_iterator getInnermostEHScope() const { return innermostEHScope; } + /// An unstable reference to a scope-stack depth. Invalidated by /// pushes but not pops. class iterator; @@ -233,6 +238,9 @@ class EHScopeStack { /// Returns an iterator pointing to the innermost EH scope. iterator begin() const; + /// Returns an iterator pointing to the outermost EH scope. 
+ iterator end() const; + /// Create a stable reference to the top of the EH stack. The /// returned reference is valid until that scope is popped off the /// stack. diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 5a6193fa8d840..ba967a43ce59a 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -194,6 +194,14 @@ mlir::LogicalResult CIRToLLVMCosOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult CIRToLLVMExpOpLowering::matchAndRewrite( + cir::ExpOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + mlir::Type resTy = typeConverter->convertType(op.getType()); + rewriter.replaceOpWithNewOp(op, resTy, adaptor.getSrc()); + return mlir::success(); +} + static mlir::Value getLLVMIntCast(mlir::ConversionPatternRewriter &rewriter, mlir::Value llvmSrc, mlir::Type llvmDstIntTy, bool isUnsigned, uint64_t cirSrcWidth, @@ -1336,6 +1344,14 @@ mlir::LogicalResult CIRToLLVMATanOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult CIRToLLVMCeilOpLowering::matchAndRewrite( + cir::CeilOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + mlir::Type resTy = typeConverter->convertType(op.getType()); + rewriter.replaceOpWithNewOp(op, resTy, adaptor.getSrc()); + return mlir::success(); +} + mlir::LogicalResult CIRToLLVMAllocaOpLowering::matchAndRewrite( cir::AllocaOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index fbf4a5722caed..b6928ce7d9c44 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -160,6 +160,57 @@ static Value *handleHlslSplitdouble(const CallExpr *E, CodeGenFunction *CGF) { return LastInst; } +static Value *handleElementwiseF16ToF32(CodeGenFunction &CGF, + const CallExpr *E) { + Value *Op0 = CGF.EmitScalarExpr(E->getArg(0)); + QualType Op0Ty = E->getArg(0)->getType(); + llvm::Type *ResType = CGF.FloatTy; + uint64_t NumElements = 0; + if (Op0->getType()->isVectorTy()) { + NumElements = + E->getArg(0)->getType()->castAs()->getNumElements(); + ResType = + llvm::VectorType::get(ResType, ElementCount::getFixed(NumElements)); + } + if (!Op0Ty->hasUnsignedIntegerRepresentation()) + llvm_unreachable( + "f16tof32 operand must have an unsigned int representation"); + + if (CGF.CGM.getTriple().isDXIL()) + return CGF.Builder.CreateIntrinsic(ResType, Intrinsic::dx_legacyf16tof32, + ArrayRef{Op0}, nullptr, + "hlsl.f16tof32"); + + if (CGF.CGM.getTriple().isSPIRV()) { + // We use the SPIRV UnpackHalf2x16 operation to avoid the need for the + // Int16 and Float16 capabilities + auto UnpackType = + llvm::VectorType::get(CGF.FloatTy, ElementCount::getFixed(2)); + if (NumElements == 0) { + // a scalar input - simply extract the first element of the unpacked + // vector + Value *Unpack = CGF.Builder.CreateIntrinsic( + UnpackType, Intrinsic::spv_unpackhalf2x16, ArrayRef{Op0}); + return CGF.Builder.CreateExtractElement(Unpack, (uint64_t)0); + } else { + // a vector input - build a congruent output vector by iterating through + // the input vector calling unpackhalf2x16 for each element + Value *Result = PoisonValue::get(ResType); + for (uint64_t i = 0; i < NumElements; i++) { + Value *InVal = CGF.Builder.CreateExtractElement(Op0, i); + Value *Unpack = CGF.Builder.CreateIntrinsic( + UnpackType, 
Intrinsic::spv_unpackhalf2x16, + ArrayRef{InVal}); + Value *Res = CGF.Builder.CreateExtractElement(Unpack, (uint64_t)0); + Result = CGF.Builder.CreateInsertElement(Result, Res, i); + } + return Result; + } + } + + llvm_unreachable("Intrinsic F16ToF32 not supported by target architecture"); +} + static Value *emitBufferStride(CodeGenFunction *CGF, const Expr *HandleExpr, LValue &Stride) { // Figure out the stride of the buffer elements from the handle type. @@ -579,6 +630,9 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, /*ReturnType=*/X->getType(), CGM.getHLSLRuntime().getDegreesIntrinsic(), ArrayRef{X}, nullptr, "hlsl.degrees"); } + case Builtin::BI__builtin_hlsl_elementwise_f16tof32: { + return handleElementwiseF16ToF32(*this, E); + } case Builtin::BI__builtin_hlsl_elementwise_frac: { Value *Op0 = EmitScalarExpr(E->getArg(0)); if (!E->getArg(0)->getType()->hasFloatingRepresentation()) diff --git a/clang/lib/CodeGen/ModuleBuilder.cpp b/clang/lib/CodeGen/ModuleBuilder.cpp index 96f3f6221e20f..8ec8aef311656 100644 --- a/clang/lib/CodeGen/ModuleBuilder.cpp +++ b/clang/lib/CodeGen/ModuleBuilder.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/VirtualFileSystem.h" #include @@ -378,3 +379,31 @@ clang::CreateLLVMCodeGen(DiagnosticsEngine &Diags, llvm::StringRef ModuleName, HeaderSearchOpts, PreprocessorOpts, CGO, C, CoverageInfo); } + +namespace clang { +namespace CodeGen { +std::optional> +DemangleTrapReasonInDebugInfo(StringRef FuncName) { + static auto TrapRegex = + llvm::Regex(llvm::formatv("^{0}\\$(.*)\\$(.*)$", ClangTrapPrefix).str()); + llvm::SmallVector Matches; + std::string *ErrorPtr = nullptr; +#ifndef NDEBUG + std::string Error; + ErrorPtr = &Error; +#endif + if (!TrapRegex.match(FuncName, &Matches, ErrorPtr)) { + assert(ErrorPtr && ErrorPtr->empty() && "Invalid regex pattern"); + return {}; + } + + if (Matches.size() != 3) { + assert(0 && "Expected 3 matches from Regex::match"); + return {}; + } + + // Returns { Trap Category, Trap Message } + return std::make_pair(Matches[1], Matches[2]); +} +} // namespace CodeGen +} // namespace clang diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 51618d17a4180..a0b82cec9a372 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -3857,6 +3857,9 @@ class OffloadingActionBuilder final { /// Flag set to true if all valid builders allow file bundling/unbundling. bool CanUseBundler; + /// Flag set to false if an argument turns off bundling. + bool ShouldUseBundler; + public: OffloadingActionBuilder(Compilation &C, DerivedArgList &Args, const Driver::InputList &Inputs) @@ -3891,6 +3894,9 @@ class OffloadingActionBuilder final { } CanUseBundler = ValidBuilders && ValidBuilders == ValidBuildersSupportingBundling; + + ShouldUseBundler = Args.hasFlag(options::OPT_gpu_bundle_output, + options::OPT_no_gpu_bundle_output, true); } ~OffloadingActionBuilder() { @@ -4042,11 +4048,11 @@ class OffloadingActionBuilder final { SB->appendTopLevelActions(OffloadAL); } - // If we can use the bundler, replace the host action by the bundling one in - // the resulting list. Otherwise, just append the device actions. For - // device only compilation, HostAction is a null pointer, therefore only do - // this when HostAction is not a null pointer. 
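The `f16tof32` handling above lowers the builtin to a target intrinsic, but the underlying operation is just reinterpreting an IEEE binary16 bit pattern, carried in the low bits of a 32-bit unsigned value, as a 32-bit float. A standalone, scalar-only sketch of that bit-level conversion (this is not the DXIL or SPIR-V intrinsic itself, just the math it performs):

```cpp
#include <cmath>
#include <cstdint>
#include <iostream>

// Convert one IEEE-754 binary16 bit pattern (in the low 16 bits) to float.
float halfBitsToFloat(std::uint16_t h) {
  std::uint32_t sign = (h >> 15) & 0x1;
  std::uint32_t exp = (h >> 10) & 0x1F;
  std::uint32_t mant = h & 0x3FF;
  float value;
  if (exp == 0)        // zero or subnormal: mant * 2^-24
    value = std::ldexp(static_cast<float>(mant), -24);
  else if (exp == 31)  // infinity or NaN
    value = mant ? std::nanf("") : INFINITY;
  else                 // normal: (1024 + mant) * 2^(exp - 25) == 1.mant * 2^(exp - 15)
    value = std::ldexp(static_cast<float>(mant + 1024), static_cast<int>(exp) - 25);
  return sign ? -value : value;
}

int main() {
  std::cout << halfBitsToFloat(0x3C00) << '\n';  // 1
  std::cout << halfBitsToFloat(0xC000) << '\n';  // -2
  std::cout << halfBitsToFloat(0x3555) << '\n';  // ~0.333
}
```

For the vector case, the same conversion is simply applied lane by lane, which is what the element-wise loop over `unpackhalf2x16` amounts to.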
- if (CanUseBundler && HostAction && + // If we can and should use the bundler, replace the host action by the + // bundling one in the resulting list. Otherwise, just append the device + // actions. For device only compilation, HostAction is a null pointer, + // therefore only do this when HostAction is not a null pointer. + if (CanUseBundler && ShouldUseBundler && HostAction && HostAction->getType() != types::TY_Nothing && !OffloadAL.empty()) { // Add the host action to the list in order to create the bundling action. OffloadAL.push_back(HostAction); @@ -6463,9 +6469,16 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA, (JA.getOffloadingDeviceKind() == Action::OFK_OpenMP && TC && TC->getTriple().isAMDGPU())); }; - if (!AtTopLevel && JA.getType() == types::TY_LLVM_BC && - (C.getArgs().hasArg(options::OPT_emit_llvm) || - IsAMDRDCInCompilePhase(JA, C.getArgs()))) + + // The linker wrapper may not support the input and output files to be the + // same one, and without it -save-temps can fail. + bool IsLinkerWrapper = + JA.getType() == types::TY_Object && isa(JA); + bool IsEmitBitcode = JA.getType() == types::TY_LLVM_BC && + (C.getArgs().hasArg(options::OPT_emit_llvm) || + IsAMDRDCInCompilePhase(JA, C.getArgs())); + + if (!AtTopLevel && (IsLinkerWrapper || IsEmitBitcode)) Suffixed += ".tmp"; Suffixed += '.'; Suffixed += Suffix; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index d3ab6f1261ad6..30d3e5293a31b 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7879,10 +7879,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, !TC.getTriple().isAndroid() && TC.useIntegratedAs())) CmdArgs.push_back("-faddrsig"); - if ((Triple.isOSBinFormatELF() || Triple.isOSBinFormatMachO()) && + const bool HasDefaultDwarf2CFIASM = + (Triple.isOSBinFormatELF() || Triple.isOSBinFormatMachO()) && (EH || UnwindTables || AsyncUnwindTables || - DebugInfoKind != llvm::codegenoptions::NoDebugInfo)) - CmdArgs.push_back("-D__GCC_HAVE_DWARF2_CFI_ASM=1"); + DebugInfoKind != llvm::codegenoptions::NoDebugInfo); + if (Args.hasFlag(options::OPT_fdwarf2_cfi_asm, + options::OPT_fno_dwarf2_cfi_asm, HasDefaultDwarf2CFIASM)) + CmdArgs.push_back("-fdwarf2-cfi-asm"); if (Arg *A = Args.getLastArg(options::OPT_fsymbol_partition_EQ)) { std::string Str = A->getAsString(Args); diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp index f24b8ab14bdce..406c77cb3ae8f 100644 --- a/clang/lib/Format/WhitespaceManager.cpp +++ b/clang/lib/Format/WhitespaceManager.cpp @@ -591,7 +591,8 @@ static unsigned AlignTokens(const FormatStyle &Style, F &&Matches, CurrentChangeWidthRight = CurrentChange.TokenLength; const FormatToken *MatchingParenToEncounter = nullptr; for (unsigned J = I + 1; - J != E && (Changes[J].NewlinesBefore == 0 || MatchingParenToEncounter); + J != E && (Changes[J].NewlinesBefore == 0 || + MatchingParenToEncounter || Changes[J].IsAligned); ++J) { const auto &Change = Changes[J]; const auto *Tok = Change.Tok; diff --git a/clang/lib/Frontend/DependencyFile.cpp b/clang/lib/Frontend/DependencyFile.cpp index 15fa7de35df97..93e012b163878 100644 --- a/clang/lib/Frontend/DependencyFile.cpp +++ b/clang/lib/Frontend/DependencyFile.cpp @@ -75,6 +75,17 @@ struct DepCollectorPPCallbacks : public PPCallbacks { /*IsMissing*/ false); } + bool EmbedFileNotFound(StringRef FileName) override { + DepCollector.maybeAddDependency( + llvm::sys::path::remove_leading_dotslash(FileName), 
+ /*FromModule=*/false, + /*IsSystem=*/false, + /*IsModuleFile=*/false, + /*IsMissing=*/true); + // Return true to silence the file not found diagnostic. + return true; + } + void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index ed3f1f93d25d3..b88d9f89c5f71 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1516,6 +1516,9 @@ static void InitializePredefinedMacros(const TargetInfo &TI, if (LangOpts.PointerAuthIntrinsics) Builder.defineMacro("__PTRAUTH__"); + if (CGOpts.Dwarf2CFIAsm) + Builder.defineMacro("__GCC_HAVE_DWARF2_CFI_ASM"); + // Get other target #defines. TI.getTargetDefines(LangOpts, Builder); } diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp index aea3e72d92a84..10032184b5d94 100644 --- a/clang/lib/Frontend/TextDiagnostic.cpp +++ b/clang/lib/Frontend/TextDiagnostic.cpp @@ -349,14 +349,13 @@ struct SourceColumnMap { /// When the source code line we want to print is too long for /// the terminal, select the "interesting" region. -static void selectInterestingSourceRegion(std::string &SourceLine, - std::string &CaretLine, - std::string &FixItInsertionLine, - Columns NonGutterColumns, - const SourceColumnMap &Map) { - Columns CaretColumns = Columns(CaretLine.size()); - Columns FixItColumns = - Columns(llvm::sys::locale::columnWidth(FixItInsertionLine)); +static void selectInterestingSourceRegion( + std::string &SourceLine, std::string &CaretLine, + std::string &FixItInsertionLine, Columns NonGutterColumns, + const SourceColumnMap &Map, + SmallVectorImpl &Styles) { + Columns CaretColumns = CaretLine.size(); + Columns FixItColumns = llvm::sys::locale::columnWidth(FixItInsertionLine); Columns MaxColumns = std::max({Map.columns().V, CaretColumns.V, FixItColumns.V}); // if the number of columns is less than the desired number we're done @@ -369,13 +368,11 @@ static void selectInterestingSourceRegion(std::string &SourceLine, // Find the slice that we need to display the full caret line // correctly. Columns CaretStart = 0, CaretEnd = CaretLine.size(); - for (; CaretStart != CaretEnd; CaretStart = CaretStart.next()) - if (!isWhitespace(CaretLine[CaretStart.V])) - break; + while (CaretStart != CaretEnd && isWhitespace(CaretLine[CaretStart.V])) + CaretStart = CaretStart.next(); - for (; CaretEnd != CaretStart; CaretEnd = CaretEnd.prev()) - if (!isWhitespace(CaretLine[CaretEnd.V - 1])) - break; + while (CaretEnd != CaretStart && isWhitespace(CaretLine[CaretEnd.V])) + CaretEnd = CaretEnd.prev(); // caret has already been inserted into CaretLine so the above whitespace // check is guaranteed to include the caret @@ -516,13 +513,45 @@ static void selectInterestingSourceRegion(std::string &SourceLine, assert(FrontColumnsRemoved + ColumnsKept + BackColumnsRemoved > NonGutterColumns); + // Since we've modified the SourceLine, we also need to adjust the line's + // highlighting information. In particular, if we've removed + // from the front of the line, we need to move the style ranges to the + // left and remove unneeded ranges. + // Note in particular that variables like CaretEnd are defined in the + // CaretLine, which only contains ASCII, while the style ranges are defined in + // the source line, where we have to care for the byte-index != column-index + // case. 
+ Bytes BytesRemoved = + FrontColumnsRemoved > FrontEllipse.size() + ? (Map.columnToByte(FrontColumnsRemoved) - Bytes(FrontEllipse.size())) + : 0; + Bytes CodeEnd = + CaretEnd < Map.columns() ? Map.columnToByte(CaretEnd.V) : CaretEnd.V; + for (TextDiagnostic::StyleRange &R : Styles) { + // Remove style ranges before and after the new truncated snippet. + if (R.Start >= static_cast(CodeEnd.V) || + R.End < static_cast(BytesRemoved.V)) { + R.Start = R.End = std::numeric_limits::max(); + continue; + } + // Move them left. (Note that this can wrap R.Start, but that doesn't + // matter). + R.Start -= BytesRemoved.V; + R.End -= BytesRemoved.V; + + // Don't leak into the ellipse at the end. + if (R.Start < static_cast(CodeEnd.V) && + R.End > static_cast(CodeEnd.V)) + R.End = CodeEnd.V + 1; // R.End is inclusive. + } + // The line needs some truncation, and we'd prefer to keep the front // if possible, so remove the back if (BackColumnsRemoved > Columns(BackEllipse.size())) SourceLine.replace(SourceEnd.V, std::string::npos, BackEllipse); // If that's enough then we're done - if (FrontColumnsRemoved + ColumnsKept <= Columns(NonGutterColumns)) + if (FrontColumnsRemoved + ColumnsKept <= NonGutterColumns) return; // Otherwise remove the front as well @@ -1391,6 +1420,11 @@ void TextDiagnostic::emitSnippetAndCaret( OS.indent(MaxLineNoDisplayWidth + 2) << "| "; }; + Columns MessageLength = DiagOpts.MessageLength; + // If we don't have enough columns available, just abort now. + if (MessageLength != 0 && MessageLength <= Columns(MaxLineNoDisplayWidth + 4)) + return; + // Prepare source highlighting information for the lines we're about to // emit, starting from the first line. std::unique_ptr[]> SourceStyles = @@ -1450,10 +1484,14 @@ void TextDiagnostic::emitSnippetAndCaret( // If the source line is too long for our terminal, select only the // "interesting" source region within that line. - Columns MessageLength = DiagOpts.MessageLength; - if (MessageLength.V != 0) + if (MessageLength != 0) { + Columns NonGutterColumns = MessageLength; + if (MaxLineNoDisplayWidth != 0) + NonGutterColumns -= Columns(MaxLineNoDisplayWidth + 4); selectInterestingSourceRegion(SourceLine, CaretLine, FixItInsertionLine, - MessageLength, SourceColMap); + NonGutterColumns, SourceColMap, + SourceStyles[LineNo - Lines.first]); + } // If we are in -fdiagnostics-print-source-range-info mode, we are trying // to produce easily machine parsable output. Add a space before the @@ -1508,7 +1546,7 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine, // Print the source line one character at a time. bool PrintReversed = false; std::optional CurrentColor; - size_t I = 0; + size_t I = 0; // Bytes. 
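The style-range fix-up above boils down to three steps: shift byte-based highlight ranges left by the number of bytes trimmed from the front of the line, drop ranges that fall entirely outside the kept window, and clamp ranges that straddle its end. A simplified standalone version of that adjustment, using half-open ranges rather than the inclusive `R.End` convention of the real code:

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

struct Range { std::size_t start, end; };  // half-open [start, end), in bytes

std::vector<Range> adjustRanges(const std::vector<Range> &in,
                                std::size_t bytesRemoved, std::size_t keptEnd) {
  std::vector<Range> out;
  for (Range r : in) {
    if (r.end <= bytesRemoved || r.start >= keptEnd)
      continue;  // entirely before or after the kept window: drop it
    r.start = r.start < bytesRemoved ? 0 : r.start - bytesRemoved;
    r.end = (r.end > keptEnd ? keptEnd : r.end) - bytesRemoved;  // clamp, then shift
    out.push_back(r);
  }
  return out;
}

int main() {
  // Keep bytes [4, 20) of the original line, i.e. 4 leading bytes removed.
  for (Range r : adjustRanges({{0, 3}, {2, 8}, {10, 30}}, 4, 20))
    std::cout << '[' << r.start << ',' << r.end << ") ";
  std::cout << '\n';  // [0,4) [6,16)
}
```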
while (I < SourceLine.size()) { auto [Str, WasPrintable] = printableTextForNextCharacter(SourceLine, &I, DiagOpts.TabStop); diff --git a/clang/lib/Headers/avx10_2_512bf16intrin.h b/clang/lib/Headers/avx10_2_512bf16intrin.h index 37ebc4f46a826..46ec12a63ef9c 100644 --- a/clang/lib/Headers/avx10_2_512bf16intrin.h +++ b/clang/lib/Headers/avx10_2_512bf16intrin.h @@ -24,6 +24,12 @@ typedef __bf16 __m512bh_u __attribute__((__vector_size__(64), __aligned__(1))); __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \ __min_vector_width__(512))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr +#else +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 +#endif + static __inline __m512bh __DEFAULT_FN_ATTRS512 _mm512_setzero_pbh(void) { return __builtin_bit_cast(__m512bh, _mm512_setzero_ps()); } @@ -167,7 +173,7 @@ _mm512_mask_blend_pbh(__mmask32 __U, __m512bh __A, __m512bh __W) { (__v32bf)__A); } -static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +static __inline__ __m512bh __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_permutex2var_pbh(__m512bh __A, __m512i __I, __m512bh __B) { return (__m512bh)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, (__v32hi)__B); @@ -555,6 +561,7 @@ static __inline__ __m512bh __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pbh( (__v32bf)_mm512_setzero_pbh()); } +#undef __DEFAULT_FN_ATTRS512_CONSTEXPR #undef __DEFAULT_FN_ATTRS512 #endif diff --git a/clang/lib/Headers/avx10_2bf16intrin.h b/clang/lib/Headers/avx10_2bf16intrin.h index 765cd682986b4..8fb8cd7cd0865 100644 --- a/clang/lib/Headers/avx10_2bf16intrin.h +++ b/clang/lib/Headers/avx10_2bf16intrin.h @@ -27,6 +27,14 @@ typedef __bf16 __m256bh_u __attribute__((__vector_size__(32), __aligned__(1))); __attribute__((__always_inline__, __nodebug__, __target__("avx10.2"), \ __min_vector_width__(128))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + static __inline __m256bh __DEFAULT_FN_ATTRS256 _mm256_setzero_pbh(void) { return __builtin_bit_cast(__m256bh, _mm256_setzero_ps()); } @@ -287,13 +295,13 @@ _mm256_mask_blend_pbh(__mmask16 __U, __m256bh __A, __m256bh __W) { (__v16bf)__A); } -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +static __inline__ __m128bh __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_pbh(__m128bh __A, __m128i __I, __m128bh __B) { return (__m128bh)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, (__v8hi)__B); } -static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +static __inline__ __m256bh __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_pbh(__m256bh __A, __m256i __I, __m256bh __B) { return (__m256bh)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, (__v16hi)__B); @@ -1080,6 +1088,7 @@ _mm_maskz_fnmsub_pbh(__mmask8 __U, __m128bh __A, __m128bh __B, __m128bh __C) { #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 - +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #endif #endif diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index ac75b6ccde735..aab1f2b61ab8a 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -969,35 +969,31 @@ _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, 
__m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, (__v32hi)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectw_512(__U, (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), (__v32hi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectw_512(__U, (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), (__v32hi)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectw_512(__U, (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), (__v32hi)_mm512_setzero_si512()); diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 18c4a44a4c76e..5fc0afa49ce4c 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -3059,69 +3059,61 @@ _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) /* Vector permutations */ -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I, (__v16si) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), (__v16si)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), (__v16si)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectd_512(__U, (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), (__v16si)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I, (__v8di) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask_permutex2var_epi64(__m512i 
__A, __mmask8 __U, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectq_512(__U, (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), (__v8di)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectq_512(__U, (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), (__v8di)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectq_512(__U, (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), (__v8di)_mm512_setzero_si512()); @@ -5949,71 +5941,66 @@ _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) (__v16sf)_mm512_setzero_ps()); } -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) -{ +static __inline __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) { return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I, (__v8df)__B); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B) -{ +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, + __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512(__U, (__v8df)_mm512_permutex2var_pd(__A, __I, __B), (__v8df)__A); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U, - __m512d __B) -{ + __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512(__U, (__v8df)_mm512_permutex2var_pd(__A, __I, __B), (__v8df)(__m512d)__I); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I, - __m512d __B) -{ + __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512(__U, (__v8df)_mm512_permutex2var_pd(__A, __I, __B), (__v8df)_mm512_setzero_pd()); } -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) -{ +static __inline __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) { return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I, (__v16sf) __B); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, + __m512 __B) { return (__m512)__builtin_ia32_selectps_512(__U, (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), (__v16sf)__A); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, + __m512 __B) { return (__m512)__builtin_ia32_selectps_512(__U, (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), (__v16sf)(__m512)__I); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 
-_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B) -{ +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, + __m512 __B) { return (__m512)__builtin_ia32_selectps_512(__U, (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), (__v16sf)_mm512_setzero_ps()); } - #define _mm512_cvtt_roundpd_epu32(A, R) \ ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ (__v8si)_mm256_undefined_si256(), \ diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h index 142cc079c2c4b..25051228f3e0a 100644 --- a/clang/lib/Headers/avx512fp16intrin.h +++ b/clang/lib/Headers/avx512fp16intrin.h @@ -3316,13 +3316,13 @@ _mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { (__v32hf)__A); } -static __inline__ __m512h __DEFAULT_FN_ATTRS512 +static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) { return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, (__v32hi)__B); } -static __inline__ __m512h __DEFAULT_FN_ATTRS512 +static __inline__ __m512h __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_permutexvar_ph(__m512i __A, __m512h __B) { return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); } diff --git a/clang/lib/Headers/avx512ifmaintrin.h b/clang/lib/Headers/avx512ifmaintrin.h index 625a8ff66dc60..f73b607df797f 100644 --- a/clang/lib/Headers/avx512ifmaintrin.h +++ b/clang/lib/Headers/avx512ifmaintrin.h @@ -17,9 +17,8 @@ /* Define the default attributes for the functions in this file. */ #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS \ - constexpr \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ - __min_vector_width__(512))) + __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ + __min_vector_width__(512))) constexpr #else #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), \ diff --git a/clang/lib/Headers/avx512ifmavlintrin.h b/clang/lib/Headers/avx512ifmavlintrin.h index b377c17166ffb..51d5210e5aa5d 100644 --- a/clang/lib/Headers/avx512ifmavlintrin.h +++ b/clang/lib/Headers/avx512ifmavlintrin.h @@ -18,13 +18,13 @@ /* Define the default attributes for the functions in this file. 
*/ #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS128 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512ifma,avx512vl"), \ - __min_vector_width__(128))) + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512ifma,avx512vl"), \ + __min_vector_width__(128))) constexpr #define __DEFAULT_FN_ATTRS256 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512ifma,avx512vl"), \ - __min_vector_width__(256))) + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512ifma,avx512vl"), \ + __min_vector_width__(256))) constexpr #else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, \ @@ -34,7 +34,6 @@ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512ifma,avx512vl"), \ __min_vector_width__(256))) - #endif #if !(defined(__AVXIFMA__) || defined(__AVX512IFMA__)) diff --git a/clang/lib/Headers/avx512vbmiintrin.h b/clang/lib/Headers/avx512vbmiintrin.h index 964535c4c4900..84fda5c5849e8 100644 --- a/clang/lib/Headers/avx512vbmiintrin.h +++ b/clang/lib/Headers/avx512vbmiintrin.h @@ -19,59 +19,57 @@ __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), \ __min_vector_width__(512))) -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) -{ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) { return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I, (__v64qi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectb_512(__U, (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), (__v64qi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectb_512(__U, (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), (__v64qi)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR _mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I, - __m512i __B) -{ + __m512i __B) { return (__m512i)__builtin_ia32_selectb_512(__U, (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_permutexvar_epi8 (__m512i __A, __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_permutexvar_epi8(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A, - __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_maskz_permutexvar_epi8(__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, (__v64qi)_mm512_permutexvar_epi8(__A, __B), (__v64qi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_permutexvar_epi8 (__m512i __W, 
__mmask64 __M, __m512i __A, - __m512i __B) -{ +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask_permutexvar_epi8(__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, (__v64qi)_mm512_permutexvar_epi8(__A, __B), (__v64qi)__W); @@ -99,8 +97,6 @@ _mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y) (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y), (__v64qi)_mm512_setzero_si512()); } - - +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #undef __DEFAULT_FN_ATTRS - #endif diff --git a/clang/lib/Headers/avx512vbmivlintrin.h b/clang/lib/Headers/avx512vbmivlintrin.h index 4c50be7d9e7e5..58a48dadff863 100644 --- a/clang/lib/Headers/avx512vbmivlintrin.h +++ b/clang/lib/Headers/avx512vbmivlintrin.h @@ -24,117 +24,110 @@ __target__("avx512vbmi,avx512vl"), \ __min_vector_width__(256))) -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) -{ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr +#else +#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 +#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 +#endif + +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A, (__v16qi)__I, (__v16qi)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I, - __m128i __B) -{ + __m128i __B) { return (__m128i)__builtin_ia32_selectb_128(__U, (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), (__v16qi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U, - __m128i __B) -{ + __m128i __B) { return (__m128i)__builtin_ia32_selectb_128(__U, (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), (__v16qi)__I); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, - __m128i __B) -{ + __m128i __B) { return (__m128i)__builtin_ia32_selectb_128(__U, (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I, (__v32qi)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I, - __m256i __B) -{ + __m256i __B) { return (__m256i)__builtin_ia32_selectb_256(__U, (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), (__v32qi)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U, - __m256i __B) -{ + __m256i __B) { return (__m256i)__builtin_ia32_selectb_256(__U, (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), (__v32qi)__I); } -static __inline__ __m256i 
__DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I, - __m256i __B) -{ + __m256i __B) { return (__m256i)__builtin_ia32_selectb_256(__U, (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutexvar_epi8 (__m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_permutexvar_epi8(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_permutexvar_epi8(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, (__v16qi)_mm_permutexvar_epi8(__A, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_permutexvar_epi8(__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, (__v16qi)_mm_permutexvar_epi8(__A, __B), (__v16qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutexvar_epi8 (__m256i __A, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutexvar_epi8(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutexvar_epi8(__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, (__v32qi)_mm256_permutexvar_epi8(__A, __B), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_permutexvar_epi8(__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, (__v32qi)_mm256_permutexvar_epi8(__A, __B), (__v32qi)__W); @@ -186,7 +179,8 @@ _mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y) (__v32qi)_mm256_setzero_si256()); } - +#undef __DEFAULT_FN_ATTRS128_CONSTEXPR +#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h index 263a1079b26d5..575c0c8962662 100644 --- a/clang/lib/Headers/avx512vlbwintrin.h +++ b/clang/lib/Headers/avx512vlbwintrin.h @@ -1223,69 +1223,61 @@ _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, (__v8hi) __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR 
_mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I, - __m128i __B) -{ + __m128i __B) { return (__m128i)__builtin_ia32_selectw_128(__U, (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), (__v8hi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U, - __m128i __B) -{ + __m128i __B) { return (__m128i)__builtin_ia32_selectw_128(__U, (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), (__v8hi)__I); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I, - __m128i __B) -{ +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_permutex2var_epi16(__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) { return (__m128i)__builtin_ia32_selectw_128(__U, (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, (__v16hi)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I, - __m256i __B) -{ + __m256i __B) { return (__m256i)__builtin_ia32_selectw_256(__U, (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), (__v16hi)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U, - __m256i __B) -{ + __m256i __B) { return (__m256i)__builtin_ia32_selectw_256(__U, (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), (__v16hi)__I); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I, - __m256i __B) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i __I, + __m256i __B) { return (__m256i)__builtin_ia32_selectw_256(__U, (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), (__v16hi)_mm256_setzero_si256()); diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h index 5b2b3f0d0bbd4..885231b030b23 100644 --- a/clang/lib/Headers/avx512vlfp16intrin.h +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -2010,24 +2010,24 @@ _mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) { (__v16hf)__A); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) { return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, (__v8hi)__B); } -static __inline__ __m256h __DEFAULT_FN_ATTRS256 +static __inline__ __m256h __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) { return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, (__v16hi)__B); } -static __inline__ __m128h __DEFAULT_FN_ATTRS128 +static __inline__ __m128h __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutexvar_ph(__m128i __A, __m128h __B) { return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A); } -static __inline__ __m256h __DEFAULT_FN_ATTRS256 +static __inline__ __m256h 
__DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutexvar_ph(__m256i __A, __m256h __B) { return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); } diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index 92bb444aeb5b8..e5249926b934e 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -3556,13 +3556,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)_mm256_setzero_ps()); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2vard128((__v4si) __A, (__v4si)__I, (__v4si)__B); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, @@ -3570,7 +3570,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4si)__A); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, @@ -3578,7 +3578,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4si)__I); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128(__U, @@ -3586,13 +3586,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4si)_mm_setzero_si128()); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2vard256((__v8si)__A, (__v8si) __I, (__v8si) __B); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, @@ -3600,7 +3600,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8si)__A); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, @@ -3608,7 +3608,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8si)__I); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256(__U, @@ -3616,40 +3616,43 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8si)_mm256_setzero_si256()); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) { return (__m128d)__builtin_ia32_vpermi2varpd128((__v2df)__A, (__v2di)__I, (__v2df)__B); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) { + static __inline__ __m128d 
__DEFAULT_FN_ATTRS128_CONSTEXPR + _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, + __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128(__U, (__v2df)_mm_permutex2var_pd(__A, __I, __B), (__v2df)__A); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) { + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR + _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, + __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128(__U, (__v2df)_mm_permutex2var_pd(__A, __I, __B), (__v2df)(__m128d)__I); } - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) { + static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR + _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, + __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128(__U, (__v2df)_mm_permutex2var_pd(__A, __I, __B), (__v2df)_mm_setzero_pd()); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) { return (__m256d)__builtin_ia32_vpermi2varpd256((__v4df)__A, (__v4di)__I, (__v4df)__B); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256(__U, @@ -3657,7 +3660,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4df)__A); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, __mmask8 __U, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256(__U, @@ -3665,7 +3668,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4df)(__m256d)__I); } - static __inline__ __m256d __DEFAULT_FN_ATTRS256 + static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256(__U, @@ -3673,47 +3676,48 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4df)_mm256_setzero_pd()); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) { return (__m128)__builtin_ia32_vpermi2varps128((__v4sf)__A, (__v4si)__I, (__v4sf)__B); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) { return (__m128)__builtin_ia32_selectps_128(__U, (__v4sf)_mm_permutex2var_ps(__A, __I, __B), (__v4sf)__A); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) { return (__m128)__builtin_ia32_selectps_128(__U, (__v4sf)_mm_permutex2var_ps(__A, __I, __B), (__v4sf)(__m128)__I); } - static __inline__ __m128 __DEFAULT_FN_ATTRS128 + static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) { return (__m128)__builtin_ia32_selectps_128(__U, (__v4sf)_mm_permutex2var_ps(__A, __I, __B), (__v4sf)_mm_setzero_ps()); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) { return (__m256)__builtin_ia32_vpermi2varps256((__v8sf)__A, (__v8si)__I, (__v8sf) __B); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 - _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) { + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR + _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, + __m256 __B) { return (__m256)__builtin_ia32_selectps_256(__U, (__v8sf)_mm256_permutex2var_ps(__A, __I, __B), (__v8sf)__A); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U, __m256 __B) { return (__m256)__builtin_ia32_selectps_256(__U, @@ -3721,7 +3725,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)(__m256)__I); } - static __inline__ __m256 __DEFAULT_FN_ATTRS256 + static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, __m256 __B) { return (__m256)__builtin_ia32_selectps_256(__U, @@ -3729,13 +3733,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v8sf)_mm256_setzero_ps()); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_vpermi2varq128((__v2di)__A, (__v2di)__I, (__v2di)__B); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128(__U, @@ -3743,7 +3747,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v2di)__A); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128(__U, @@ -3751,7 +3755,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v2di)__I); } - static __inline__ __m128i __DEFAULT_FN_ATTRS128 + static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128(__U, @@ -3759,14 +3763,13 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v2di)_mm_setzero_si128()); } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_vpermi2varq256((__v4di)__A, (__v4di) __I, (__v4di) __B); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256(__U, @@ -3774,7 +3777,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__v4di)__A); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256(__U, @@ -3782,7 +3785,7 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 
__B) { (__v4di)__I); } - static __inline__ __m256i __DEFAULT_FN_ATTRS256 + static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256(__U, diff --git a/clang/lib/Headers/avxifmaintrin.h b/clang/lib/Headers/avxifmaintrin.h index e452d5f0920e9..30df01caed6cf 100644 --- a/clang/lib/Headers/avxifmaintrin.h +++ b/clang/lib/Headers/avxifmaintrin.h @@ -17,11 +17,11 @@ /* Define the default attributes for the functions in this file. */ #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS128 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avxifma"), __min_vector_width__(128))) + __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ + __min_vector_width__(128))) constexpr #define __DEFAULT_FN_ATTRS256 \ - constexpr __attribute__((__always_inline__, __nodebug__, \ - __target__("avxifma"), __min_vector_width__(256))) + __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ + __min_vector_width__(256))) constexpr #else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \ diff --git a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h index a918af39e4074..208776eb7840e 100644 --- a/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_alias_intrinsics.h @@ -1052,6 +1052,27 @@ float3 exp2(float3); _HLSL_BUILTIN_ALIAS(__builtin_elementwise_exp2) float4 exp2(float4); +//===----------------------------------------------------------------------===// +// f16tof32 builtins +//===----------------------------------------------------------------------===// + +/// \fn float f16tof32(uint x) +/// \brief Returns the half value stored in the low 16 bits of the uint arg +/// converted to a float. +/// \param x The uint containing two half values. +/// +/// The float value of the half value found in the low 16 bits of the \a xi +/// parameter. + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float f16tof32(uint); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float2 f16tof32(uint2); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float3 f16tof32(uint3); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_elementwise_f16tof32) +float4 f16tof32(uint4); + //===----------------------------------------------------------------------===// // firstbithigh builtins //===----------------------------------------------------------------------===// @@ -2090,9 +2111,17 @@ T select(bool, T, T); /// \param FalseVals The vector values are chosen from when conditions are /// false. -template +template _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -vector select(vector, vector, vector); +vector select(vector, vector, vector); + +template +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector select(vector, vector, vector); + +template +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector select(vector, vector, vector); /// \fn vector select(vector Conds, T TrueVal, /// vector FalseVals) @@ -2102,9 +2131,17 @@ vector select(vector, vector, vector); /// \param FalseVals The vector values are chosen from when conditions are /// false. 
-template +template +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector select(vector, T, vector); + +template +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector select(vector, T, vector); + +template _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -vector select(vector, T, vector); +vector select(vector, T, vector); /// \fn vector select(vector Conds, vector TrueVals, /// T FalseVal) @@ -2113,9 +2150,17 @@ vector select(vector, T, vector); /// \param TrueVals The vector values are chosen from when conditions are true. /// \param FalseVal The scalar value to splat from when conditions are false. -template +template _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -vector select(vector, vector, T); +vector select(vector, vector, T); + +template +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector select(vector, vector, T); + +template +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +vector select(vector, vector, T); /// \fn vector select(vector Conds, vector TrueVals, /// T FalseVal) @@ -2124,10 +2169,20 @@ vector select(vector, vector, T); /// \param TrueVal The scalar value to splat from when conditions are true. /// \param FalseVal The scalar value to splat from when conditions are false. -template +template +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +__detail::enable_if_t<__detail::is_arithmetic::Value, vector> select( + vector, T, T); + +template +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) +__detail::enable_if_t<__detail::is_arithmetic::Value, vector> select( + vector, T, T); + +template _HLSL_BUILTIN_ALIAS(__builtin_hlsl_select) -__detail::enable_if_t<__detail::is_arithmetic::Value, vector> select( - vector, T, T); +__detail::enable_if_t<__detail::is_arithmetic::Value, vector> select( + vector, T, T); //===----------------------------------------------------------------------===// // sin builtins diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap index 2e4d533356569..c13dd3fd48ac8 100644 --- a/clang/lib/Headers/module.modulemap +++ b/clang/lib/Headers/module.modulemap @@ -253,6 +253,11 @@ module _Builtin_stdbool [system] { export * } +module _Builtin_stdckdint [system] { + header "stdckdint.h" + export * +} + module _Builtin_stdcountof [system] { header "stdcountof.h" export * diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index 637a08fe4dcdb..b8202ea11be36 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -258,6 +258,7 @@ static bool isBuiltinHeaderName(StringRef FileName) { .Case("stdarg.h", true) .Case("stdatomic.h", true) .Case("stdbool.h", true) + .Case("stdckdint.h", true) .Case("stdcountof.h", true) .Case("stddef.h", true) .Case("stdint.h", true) diff --git a/clang/lib/Lex/PPDirectives.cpp b/clang/lib/Lex/PPDirectives.cpp index 6a5e5d4bad3a6..891c8ab7f3155 100644 --- a/clang/lib/Lex/PPDirectives.cpp +++ b/clang/lib/Lex/PPDirectives.cpp @@ -4018,7 +4018,7 @@ void Preprocessor::HandleEmbedDirective(SourceLocation HashLoc, Token &EmbedTok, this->LookupEmbedFile(Filename, isAngled, true, LookupFromFile); if (!MaybeFileRef) { // could not find file - if (Callbacks && Callbacks->EmbedFileNotFound(OriginalFilename)) { + if (Callbacks && Callbacks->EmbedFileNotFound(Filename)) { return; } Diag(FilenameTok, diag::err_pp_file_not_found) << Filename; diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 7e4a164e34eda..5fcb659768655 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -5370,8 +5370,13 @@ void Parser::ParseEnumBody(SourceLocation StartLoc, Decl 
*EnumDecl, T.consumeOpen(); // C does not allow an empty enumerator-list, C++ does [dcl.enum]. - if (Tok.is(tok::r_brace) && !getLangOpts().CPlusPlus) - Diag(Tok, diag::err_empty_enum); + if (Tok.is(tok::r_brace) && !getLangOpts().CPlusPlus) { + if (getLangOpts().MicrosoftExt) + Diag(T.getOpenLocation(), diag::ext_ms_c_empty_enum_type) + << SourceRange(T.getOpenLocation(), Tok.getLocation()); + else + Diag(Tok, diag::err_empty_enum); + } SmallVector EnumConstantDecls; SmallVector EnumAvailabilityDiags; diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index 92038985f9163..fb45db1139349 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -813,7 +813,7 @@ StmtResult Parser::ParseCaseStatement(ParsedStmtContext StmtCtx, return StmtError(); } } else { - LHS = Expr; + LHS = Actions.ActOnCaseExpr(CaseLoc, Expr); MissingCase = false; } diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 140b709dbb651..0997966f29f5e 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -29,7 +29,9 @@ #include "clang/Analysis/Analyses/CFGReachabilityAnalysis.h" #include "clang/Analysis/Analyses/CalledOnceCheck.h" #include "clang/Analysis/Analyses/Consumed.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Facts.h" #include "clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h" +#include "clang/Analysis/Analyses/LifetimeSafety/Origins.h" #include "clang/Analysis/Analyses/ReachableCode.h" #include "clang/Analysis/Analyses/ThreadSafety.h" #include "clang/Analysis/Analyses/UninitializedValues.h" @@ -52,7 +54,9 @@ #include "llvm/ADT/STLFunctionalExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include #include #include @@ -3065,7 +3069,11 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( if (EnableLifetimeSafetyAnalysis && S.getLangOpts().CPlusPlus) { if (AC.getCFG()) { lifetimes::LifetimeSafetyReporterImpl LifetimeSafetyReporter(S); - lifetimes::runLifetimeSafetyAnalysis(AC, &LifetimeSafetyReporter); + std::unique_ptr + Analysis = + lifetimes::runLifetimeSafetyAnalysis(AC, &LifetimeSafetyReporter); + if (S.CollectStats) + FindMissingOrigins(AC, Analysis->getFactManager()); } } // Check for violations of "called once" parameter properties. 
@@ -3131,9 +3139,28 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( } } +void clang::sema::AnalysisBasedWarnings::FindMissingOrigins( + AnalysisDeclContext &AC, lifetimes::internal::FactManager &FactMgr) { + if (AC.getCFG()) { + for (const auto &[expr, count] : + FactMgr.getOriginMgr().getMissingOrigins()) { + MissingOriginCount[expr] += count; + } + } +} + void clang::sema::AnalysisBasedWarnings::PrintStats() const { + // clang::lifetimes::internal::LifetimeSafetyAnalysis::PrintStats(llvm::errs()); + llvm::errs() << "\n*** LifetimeSafety Missing Origin Stats " + "(expression_type : count) :\n"; + unsigned totalMissingOrigins = 0; + for (const auto &[expr, count] : MissingOriginCount) { + llvm::errs() << expr << " : " << count << '\n'; + totalMissingOrigins += count; + } + llvm::errs() << "Total missing origins: " << totalMissingOrigins << "\n"; + llvm::errs() << "****************************************\n"; llvm::errs() << "\n*** Analysis Based Warnings Stats:\n"; - unsigned NumCFGsBuilt = NumFunctionsAnalyzed - NumFunctionsWithBadCFGs; unsigned AvgCFGBlocksPerFunction = !NumCFGsBuilt ? 0 : NumCFGBlocks/NumCFGsBuilt; diff --git a/clang/lib/Sema/SemaFunctionEffects.cpp b/clang/lib/Sema/SemaFunctionEffects.cpp index 8590ee831084f..4b63eb7df1054 100644 --- a/clang/lib/Sema/SemaFunctionEffects.cpp +++ b/clang/lib/Sema/SemaFunctionEffects.cpp @@ -1208,8 +1208,16 @@ class Analyzer { return true; } - // No Decl, just an Expr. Just check based on its type. - checkIndirectCall(Call, CalleeExpr->getType()); + // No Decl, just an Expr. Just check based on its type. Bound member + // functions are a special expression type and need to be specially + // unpacked. + QualType CalleeExprQT = CalleeExpr->getType(); + if (CalleeExpr->isBoundMemberFunction(Outer.S.getASTContext())) { + QualType QT = Expr::findBoundMemberType(CalleeExpr); + if (!QT.isNull()) + CalleeExprQT = QT; + } + checkIndirectCall(Call, CalleeExprQT); return true; } @@ -1271,7 +1279,15 @@ class Analyzer { const CXXConstructorDecl *Ctor = Construct->getConstructor(); CallableInfo CI(*Ctor); followCall(CI, Construct->getLocation()); + return true; + } + bool VisitCXXBindTemporaryExpr(CXXBindTemporaryExpr *BTE) override { + const CXXDestructorDecl *Dtor = BTE->getTemporary()->getDestructor(); + if (Dtor != nullptr) { + CallableInfo CI(*Dtor); + followCall(CI, BTE->getBeginLoc()); + } return true; } diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 94a490a8f68dc..b9707f0036765 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -2802,6 +2802,23 @@ static bool CheckUnsignedIntRepresentation(Sema *S, SourceLocation Loc, return false; } +static bool CheckExpectedBitWidth(Sema *S, CallExpr *TheCall, + unsigned ArgOrdinal, unsigned Width) { + QualType ArgTy = TheCall->getArg(0)->getType(); + if (auto *VTy = ArgTy->getAs()) + ArgTy = VTy->getElementType(); + // ensure arg type has expected bit width + uint64_t ElementBitCount = + S->getASTContext().getTypeSizeInChars(ArgTy).getQuantity() * 8; + if (ElementBitCount != Width) { + S->Diag(TheCall->getArg(0)->getBeginLoc(), + diag::err_integer_incorrect_bit_count) + << Width << ElementBitCount; + return true; + } + return false; +} + static void SetElementTypeAsReturnType(Sema *S, CallExpr *TheCall, QualType ReturnType) { auto *VecTyA = TheCall->getArg(0)->getType()->getAs(); @@ -2961,24 +2978,16 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { CheckUnsignedIntVecRepresentation)) return true; - auto 
*VTy = TheCall->getArg(0)->getType()->getAs(); // ensure arg integers are 32-bits - uint64_t ElementBitCount = getASTContext() - .getTypeSizeInChars(VTy->getElementType()) - .getQuantity() * - 8; - if (ElementBitCount != 32) { - SemaRef.Diag(TheCall->getBeginLoc(), - diag::err_integer_incorrect_bit_count) - << 32 << ElementBitCount; + if (CheckExpectedBitWidth(&SemaRef, TheCall, 0, 32)) return true; - } // ensure both args are vectors of total bit size of a multiple of 64 + auto *VTy = TheCall->getArg(0)->getType()->getAs(); int NumElementsArg = VTy->getNumElements(); if (NumElementsArg != 2 && NumElementsArg != 4) { SemaRef.Diag(TheCall->getBeginLoc(), diag::err_vector_incorrect_bit_count) - << 1 /*a multiple of*/ << 64 << NumElementsArg * ElementBitCount; + << 1 /*a multiple of*/ << 64 << NumElementsArg * 32; return true; } @@ -3295,7 +3304,7 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { break; } // Note these are llvm builtins that we want to catch invalid intrinsic - // generation. Normal handling of these builitns will occur elsewhere. + // generation. Normal handling of these builtins will occur elsewhere. case Builtin::BI__builtin_elementwise_bitreverse: { // does not include a check for number of arguments // because that is done previously @@ -3405,6 +3414,30 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { } break; } + case Builtin::BI__builtin_hlsl_elementwise_f16tof32: { + if (SemaRef.checkArgCount(TheCall, 1)) + return true; + if (CheckAllArgTypesAreCorrect(&SemaRef, TheCall, + CheckUnsignedIntRepresentation)) + return true; + // ensure arg integers are 32 bits + if (CheckExpectedBitWidth(&SemaRef, TheCall, 0, 32)) + return true; + // check it wasn't a bool type + QualType ArgTy = TheCall->getArg(0)->getType(); + if (auto *VTy = ArgTy->getAs()) + ArgTy = VTy->getElementType(); + if (ArgTy->isBooleanType()) { + SemaRef.Diag(TheCall->getArg(0)->getBeginLoc(), + diag::err_builtin_invalid_arg_type) + << 1 << /* scalar or vector of */ 5 << /* unsigned int */ 3 + << /* no fp */ 0 << TheCall->getArg(0)->getType(); + return true; + } + + SetElementTypeAsReturnType(&SemaRef, TheCall, getASTContext().FloatTy); + break; + } } return false; } diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 073010d16b428..cc6ddf568d346 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -1897,26 +1897,29 @@ void InitListChecker::CheckMatrixType(const InitializedEntity &Entity, return; const ConstantMatrixType *MT = DeclType->castAs(); + + // For HLSL, the error reporting for this case is handled in SemaHLSL's + // initializer list diagnostics. That means the execution should require + // getNumElementsFlattened to equal getNumInits. In other words the execution + // should never reach this point if this condition is not true". + assert(IList->getNumInits() == MT->getNumElementsFlattened() && + "Inits must equal Matrix element count"); + QualType ElemTy = MT->getElementType(); - const unsigned MaxElts = MT->getNumElementsFlattened(); - unsigned NumEltsInit = 0; + Index = 0; InitializedEntity ElemEnt = InitializedEntity::InitializeElement(SemaRef.Context, 0, Entity); - while (NumEltsInit < MaxElts && Index < IList->getNumInits()) { + while (Index < IList->getNumInits()) { // Not a sublist: just consume directly. 
- ElemEnt.setElementIndex(Index); - CheckSubElementType(ElemEnt, IList, ElemTy, Index, StructuredList, + unsigned ColMajorIndex = (Index % MT->getNumRows()) * MT->getNumColumns() + + (Index / MT->getNumRows()); + ElemEnt.setElementIndex(ColMajorIndex); + CheckSubElementType(ElemEnt, IList, ElemTy, ColMajorIndex, StructuredList, StructuredIndex); - ++NumEltsInit; + ++Index; } - - // For HLSL The error for this case is handled in SemaHLSL's initializer - // list diagnostics, That means the execution should require NumEltsInit - // to equal Max initializers. In other words execution should never - // reach this point if this condition is not true". - assert(NumEltsInit == MaxElts && "NumEltsInit must equal MaxElts"); } void InitListChecker::CheckVectorType(const InitializedEntity &Entity, diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index f39896336053e..5b3ef1adf38e3 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -3281,6 +3281,9 @@ static Scope *FindLabeledBreakContinueScope(Sema &S, Scope *CurScope, SourceLocation LabelLoc, bool IsContinue) { assert(Target && "not a named break/continue?"); + + Target->markUsed(S.Context); + Scope *Found = nullptr; for (Scope *Scope = CurScope; Scope; Scope = Scope->getParent()) { if (Scope->isFunctionScope()) diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index ad50600f6399c..bfcd3978817ca 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -659,7 +659,8 @@ struct ConvertConstructorToDeductionGuideTransform { SemaRef, MaterializedTypedefs, NestedPattern, TransformingOuterPatterns ? &Args : nullptr) .transform(NewDI); - + if (!NewDI) + return nullptr; // Resolving a wording defect, we also inherit default arguments from the // constructor. ExprResult NewDefArg; diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 280b3c92cce14..c483930705057 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -2358,6 +2358,11 @@ QualType Sema::BuildVectorType(QualType CurType, Expr *SizeExpr, return QualType(); } + if (VecSize->isNegative()) { + Diag(SizeExpr->getExprLoc(), diag::err_attribute_vec_negative_size); + return QualType(); + } + if (CurType->isDependentType()) return Context.getDependentVectorType(CurType, SizeExpr, AttrLoc, VectorKind::Generic); @@ -2394,7 +2399,7 @@ QualType Sema::BuildVectorType(QualType CurType, Expr *SizeExpr, VectorKind::Generic); } -QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize, +QualType Sema::BuildExtVectorType(QualType T, Expr *SizeExpr, SourceLocation AttrLoc) { // Unlike gcc's vector_size attribute, we do not allow vectors to be defined // in conjunction with complex types (pointers, arrays, functions, etc.). 
@@ -2417,35 +2422,40 @@ QualType Sema::BuildExtVectorType(QualType T, Expr *ArraySize, BIT && CheckBitIntElementType(*this, AttrLoc, BIT)) return QualType(); - if (!ArraySize->isTypeDependent() && !ArraySize->isValueDependent()) { - std::optional vecSize = - ArraySize->getIntegerConstantExpr(Context); - if (!vecSize) { + if (!SizeExpr->isTypeDependent() && !SizeExpr->isValueDependent()) { + std::optional VecSize = + SizeExpr->getIntegerConstantExpr(Context); + if (!VecSize) { Diag(AttrLoc, diag::err_attribute_argument_type) - << "ext_vector_type" << AANT_ArgumentIntegerConstant - << ArraySize->getSourceRange(); + << "ext_vector_type" << AANT_ArgumentIntegerConstant + << SizeExpr->getSourceRange(); + return QualType(); + } + + if (VecSize->isNegative()) { + Diag(SizeExpr->getExprLoc(), diag::err_attribute_vec_negative_size); return QualType(); } - if (!vecSize->isIntN(32)) { + if (!VecSize->isIntN(32)) { Diag(AttrLoc, diag::err_attribute_size_too_large) - << ArraySize->getSourceRange() << "vector"; + << SizeExpr->getSourceRange() << "vector"; return QualType(); } // Unlike gcc's vector_size attribute, the size is specified as the // number of elements, not the number of bytes. - unsigned vectorSize = static_cast(vecSize->getZExtValue()); + unsigned VectorSize = static_cast(VecSize->getZExtValue()); - if (vectorSize == 0) { + if (VectorSize == 0) { Diag(AttrLoc, diag::err_attribute_zero_size) - << ArraySize->getSourceRange() << "vector"; + << SizeExpr->getSourceRange() << "vector"; return QualType(); } - return Context.getExtVectorType(T, vectorSize); + return Context.getExtVectorType(T, VectorSize); } - return Context.getDependentSizedExtVectorType(T, ArraySize, AttrLoc); + return Context.getDependentSizedExtVectorType(T, SizeExpr, AttrLoc); } QualType Sema::BuildMatrixType(QualType ElementTy, Expr *NumRows, Expr *NumCols, diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 3ac338e013deb..b1fd151790d96 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -4374,8 +4374,7 @@ class ASTDeclContextNameLookupTrait // parent of parent. We DON'T remove the enum constant from its parent. So // we don't need to care about merging problems here. if (auto *ECD = dyn_cast(D); - ECD && DC.isFileContext() && ECD->getOwningModule() && - ECD->getTopLevelOwningNamedModule()->isNamedModule()) { + ECD && DC.isFileContext() && ECD->getTopLevelOwningNamedModule()) { if (llvm::all_of( DC.noload_lookup( cast(ECD->getDeclContext())->getDeclName()), diff --git a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp index 2838533c1a406..4f4824a3616ce 100644 --- a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp +++ b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp @@ -714,11 +714,6 @@ class RegionStoreManager : public StoreManager { return getBinding(getRegionBindings(S), L, T); } - std::optional getUniqueDefaultBinding(RegionBindingsConstRef B, - const TypedValueRegion *R) const; - std::optional - getUniqueDefaultBinding(nonloc::LazyCompoundVal LCV) const; - std::optional getDefaultBinding(Store S, const MemRegion *R) override { RegionBindingsRef B = getRegionBindings(S); // Default bindings are always applied over a base region so look up the @@ -2465,11 +2460,6 @@ SVal RegionStoreManager::getBindingForStruct(RegionBindingsConstRef B, // behavior doesn't depend on the struct layout. // This way even an empty struct can carry taint, no matter if creduce drops // the last field member or not. 
- - // Try to avoid creating a LCV if it would anyways just refer to a single - // default binding. - if (std::optional Val = getUniqueDefaultBinding(B, R)) - return *Val; return createLazyBinding(B, R); } @@ -2757,50 +2747,12 @@ RegionStoreManager::bindVector(LimitedRegionBindingsConstRef B, return NewB; } -std::optional -RegionStoreManager::getUniqueDefaultBinding(RegionBindingsConstRef B, - const TypedValueRegion *R) const { - if (R != R->getBaseRegion()) - return std::nullopt; - - const auto *Cluster = B.lookup(R); - if (!Cluster || !llvm::hasSingleElement(*Cluster)) - return std::nullopt; - - const auto [Key, Value] = *Cluster->begin(); - return Key.isDirect() ? std::optional{} : Value; -} - -std::optional -RegionStoreManager::getUniqueDefaultBinding(nonloc::LazyCompoundVal LCV) const { - auto B = getRegionBindings(LCV.getStore()); - return getUniqueDefaultBinding(B, LCV.getRegion()); -} - std::optional RegionStoreManager::tryBindSmallStruct( LimitedRegionBindingsConstRef B, const TypedValueRegion *R, const RecordDecl *RD, nonloc::LazyCompoundVal LCV) { if (B.hasExhaustedBindingLimit()) return B.withValuesEscaped(LCV); - // If we try to copy a Conjured value representing the value of the whole - // struct, don't try to element-wise copy each field. - // That would unnecessarily bind Derived symbols slicing off the subregion for - // the field from the whole Conjured symbol. - // - // struct Window { int width; int height; }; - // Window getWindow(); <-- opaque fn. - // Window w = getWindow(); <-- conjures a new Window. - // Window w2 = w; <-- trivial copy "w", calling "tryBindSmallStruct" - // - // We should not end up with a new Store for "w2" like this: - // Direct [ 0..31]: Derived{Conj{}, w.width} - // Direct [32..63]: Derived{Conj{}, w.height} - // Instead, we should just bind that Conjured value instead. 
- if (std::optional Val = getUniqueDefaultBinding(LCV)) { - return B.addBinding(BindingKey::Make(R, BindingKey::Default), Val.value()); - } - FieldVector Fields; if (const CXXRecordDecl *Class = dyn_cast(RD)) diff --git a/clang/test/AST/ByteCode/cxx11.cpp b/clang/test/AST/ByteCode/cxx11.cpp index 427d3a106656b..e283a7b42e554 100644 --- a/clang/test/AST/ByteCode/cxx11.cpp +++ b/clang/test/AST/ByteCode/cxx11.cpp @@ -374,7 +374,7 @@ namespace GH150709 { namespace DiscardedAddrLabel { void foo(void) { L: - *&&L; // both-error {{indirection not permitted}} \ + *&&L; // both-error {{indirection not permitted on operand of type 'void *'}} \ // both-warning {{expression result unused}} } } diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp index 83f32c97c50c7..4799ebe25dde1 100644 --- a/clang/test/AST/ByteCode/records.cpp +++ b/clang/test/AST/ByteCode/records.cpp @@ -1882,3 +1882,14 @@ namespace MethodWillHaveBody { } int n = f(0); // both-note {{instantiation of}} } + +namespace StaticRedecl { + struct T { + static T tt; + constexpr T() : p(&tt) {} + T *p; + }; + T T::tt; + constexpr T t; + static_assert(t.p == &T::tt, ""); +} diff --git a/clang/test/AST/HLSL/matrix-constructors.hlsl b/clang/test/AST/HLSL/matrix-constructors.hlsl index 0a2f03c7c0fac..e1a9c53e2c602 100644 --- a/clang/test/AST/HLSL/matrix-constructors.hlsl +++ b/clang/test/AST/HLSL/matrix-constructors.hlsl @@ -9,21 +9,20 @@ typedef float float4 __attribute__((ext_vector_type(4))); [numthreads(1,1,1)] void ok() { - // CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} A 'float2x3':'matrix' cinit // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x3':'matrix' functional cast to float2x3 // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x3':'matrix' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 6 float2x3 A = float2x3(1,2,3,4,5,6); @@ -57,6 +56,8 @@ void ok() { // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' xvalue @@ -68,12 +69,10 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 
'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 6 float2x3 D = float2x3(float2(1,2), 3, 4, 5, 6); @@ -97,9 +96,9 @@ void ok() { // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' functional cast to float2 // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent @@ -107,10 +106,12 @@ void ok() { // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' functional cast to float2 // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' xvalue @@ -120,9 +121,7 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 6 float2x3 E = float2x3(float2(1,2), float2(3,4), 5, 6); @@ -158,7 +157,7 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 
'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float4':'vector' xvalue @@ -172,7 +171,9 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float4':'vector' xvalue @@ -186,9 +187,7 @@ void ok() { // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 6 float2x3 F = float2x3(float4(1,2,3,4), 5, 6); @@ -202,10 +201,10 @@ void ok() { // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 @@ -215,41 +214,41 @@ void ok() { // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' matrixcomponent // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' functional cast to float2x2 // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // 
CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' matrixcomponent // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' functional cast to float2x2 // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 6 float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); @@ -262,13 +261,13 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue Var 0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'vector' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue Var 0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'vector' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 float2 Vec2 = float2(1.0, 2.0); float2x2 H = float2x2(Vec2,3,4); @@ -281,10 +280,10 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'i' 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'j' 'int' +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'k' 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 
'int' -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'k' 'int' +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'j' 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'l' 'int' @@ -300,15 +299,15 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue .a 0x{{[0-9a-fA-F]+}} +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}} // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue .a 0x{{[0-9a-fA-F]+}} -// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue .a 0x{{[0-9a-fA-F]+}} // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' struct S { float2 f; float a;} s; float2x2 J = float2x2(s.f, s.a, s.a); @@ -317,8 +316,8 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' functional cast to float2x2 // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 1.000000e+00 -// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 2.000000e+00 // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 3.000000e+00 +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 2.000000e+00 // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 4.000000e+00 typedef float2x2 second_level_of_typedefs; second_level_of_typedefs L = float2x2(1.0f, 2.0f, 3.0f, 4.0f); @@ -327,8 +326,8 @@ float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); // CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'second_level_of_typedefs':'matrix' functional cast to second_level_of_typedefs // CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'second_level_of_typedefs':'matrix' // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 1.000000e+00 -// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 2.000000e+00 // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 3.000000e+00 +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 2.000000e+00 // CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 4.000000e+00 float2x2 M = second_level_of_typedefs(1.0f, 2.0f, 3.0f, 4.0f); @@ -367,12 +366,12 @@ float2x1 GettingStrange = float2x1(s2, s2); // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}} // CHECK-NEXT: 
DeclRefExpr 0x{{[0-9a-fA-F]+}} 'S3' lvalue Var 0x{{[0-9a-fA-F]+}} 's3' 'S3' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}} // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'S3' lvalue Var 0x{{[0-9a-fA-F]+}} 's3' 'S3' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent // CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}} diff --git a/clang/test/AST/HLSL/matrix-general-initializer.hlsl b/clang/test/AST/HLSL/matrix-general-initializer.hlsl index 14c950acb7baf..1a631113eb0f0 100644 --- a/clang/test/AST/HLSL/matrix-general-initializer.hlsl +++ b/clang/test/AST/HLSL/matrix-general-initializer.hlsl @@ -26,14 +26,6 @@ void ok() { // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xxx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xxx -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' @@ -42,20 +34,8 @@ void ok() { // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' -// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xx -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'int' x -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 -// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent // CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' 
xvalue @@ -66,10 +46,30 @@ void ok() { // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent -// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue -// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xx +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xxx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'int' x +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx}; @@ -84,12 +84,12 @@ float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent @@ -105,12 +105,12 @@ float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 
'float4x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent @@ -138,7 +138,7 @@ S s = {m}; // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent @@ -146,7 +146,7 @@ S s = {m}; // CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xxxx // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent @@ -169,25 +169,25 @@ float2x2 m2 = {0.xxxx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 
0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' @@ -199,26 +199,26 @@ float2x2 m2 = {0.xxxx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent @@ -229,25 +229,25 @@ float2x2 m2 = {0.xxxx}; // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 
0 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' // CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent // CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' -// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' // CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' diff --git a/clang/test/Analysis/LifetimeSafety/CMakeLists.txt b/clang/test/Analysis/LifetimeSafety/CMakeLists.txt index ce37a29655668..2f9c2ac247497 100644 --- a/clang/test/Analysis/LifetimeSafety/CMakeLists.txt +++ b/clang/test/Analysis/LifetimeSafety/CMakeLists.txt @@ -15,6 +15,13 @@ set(LIFETIME_BENCHMARK_REQUIREMENTS set(LIFETIME_BENCHMARK_OUTPUT_DIR "${CMAKE_CURRENT_BINARY_DIR}/benchmark_results") +if(WIN32) + set(LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE + "${LIFETIME_BENCHMARK_VENV_DIR}/Scripts/python") +else() + set(LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE + "${LIFETIME_BENCHMARK_VENV_DIR}/bin/python") +endif() if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREMENTS}) @@ -22,7 +29,7 @@ if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREME add_custom_command( OUTPUT ${LIFETIME_BENCHMARK_VENV_DIR}/pyvenv.cfg COMMAND ${Python3_EXECUTABLE} -m venv ${LIFETIME_BENCHMARK_VENV_DIR} - COMMAND ${LIFETIME_BENCHMARK_VENV_DIR}/bin/python -m pip install -r ${LIFETIME_BENCHMARK_REQUIREMENTS} + COMMAND ${LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE} -m pip install -r ${LIFETIME_BENCHMARK_REQUIREMENTS} DEPENDS ${LIFETIME_BENCHMARK_REQUIREMENTS} COMMENT "Creating Python virtual environment and installing dependencies for benchmark..." 
) @@ -32,7 +39,7 @@ if(EXISTS ${LIFETIME_BENCHMARK_SCRIPT} AND EXISTS ${LIFETIME_BENCHMARK_REQUIREME # Main benchmark target add_custom_target(benchmark_lifetime_safety_analysis - COMMAND ${LIFETIME_BENCHMARK_VENV_DIR}/bin/python ${LIFETIME_BENCHMARK_SCRIPT} + COMMAND ${LIFETIME_BENCHMARK_VENV_PYTHON_EXECUTABLE} ${LIFETIME_BENCHMARK_SCRIPT} --clang-binary ${LLVM_BINARY_DIR}/bin/clang --output-dir ${LIFETIME_BENCHMARK_OUTPUT_DIR} diff --git a/clang/test/Analysis/NewDelete-checker-test.cpp b/clang/test/Analysis/NewDelete-checker-test.cpp index c417b9c2ac97e..fd831cc0985cc 100644 --- a/clang/test/Analysis/NewDelete-checker-test.cpp +++ b/clang/test/Analysis/NewDelete-checker-test.cpp @@ -3,13 +3,13 @@ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDelete // -// RUN: %clang_analyze_cc1 -DLEAKS -std=c++11 -fblocks %s \ +// RUN: %clang_analyze_cc1 -std=c++11 -fblocks %s \ // RUN: -verify=expected,newdelete,leak \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDelete \ // RUN: -analyzer-checker=cplusplus.NewDeleteLeaks // -// RUN: %clang_analyze_cc1 -std=c++11 -fblocks -verify %s \ +// RUN: %clang_analyze_cc1 -std=c++11 -fblocks %s \ // RUN: -verify=expected,leak \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDeleteLeaks @@ -19,13 +19,13 @@ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDelete // -// RUN: %clang_analyze_cc1 -DLEAKS -std=c++17 -fblocks %s \ +// RUN: %clang_analyze_cc1 -std=c++17 -fblocks %s \ // RUN: -verify=expected,newdelete,leak \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDelete \ // RUN: -analyzer-checker=cplusplus.NewDeleteLeaks // -// RUN: %clang_analyze_cc1 -std=c++17 -fblocks -verify %s \ +// RUN: %clang_analyze_cc1 -std=c++17 -fblocks %s \ // RUN: -verify=expected,leak,inspection \ // RUN: -analyzer-checker=core \ // RUN: -analyzer-checker=cplusplus.NewDeleteLeaks \ @@ -503,3 +503,75 @@ namespace optional_union { custom_union_t a; } // leak-warning{{Potential leak of memory pointed to by 'a.present.q'}} } + +namespace gh153782 { + +// Ensure we do not regress on the following use case. + +namespace mutually_exclusive_test_case_1 { +struct StorageWrapper { + // Imagine the destructor and copy constructor both call a reset() function (among other things). + ~StorageWrapper() { delete parts; } + StorageWrapper(StorageWrapper const&) = default; + + // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find. + void operator=(StorageWrapper&& other) { delete parts; } // newdelete-warning{{Attempt to release already released memory}} + + // Not provided, typically would do `parts = new long`. + StorageWrapper(); + + long* parts; +}; + +void test_non_trivial_struct_assignment() { + StorageWrapper* object = new StorageWrapper[]{StorageWrapper()}; + object[0] = StorageWrapper(); // This assignment leads to the double-free. +} +} // mutually_exclusive_test_case_1 + +namespace mutually_exclusive_test_case_2 { +struct StorageWrapper { + // Imagine the destructor and copy constructor both call a reset() function (among other things). + ~StorageWrapper() { delete parts; } + StorageWrapper(StorageWrapper const&) = default; + + // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find. + void operator=(StorageWrapper&& other) { delete parts; } + + // Not provided, typically would do `parts = new long`. 
+ StorageWrapper(); + + long* parts; +}; + +void test_non_trivial_struct_assignment() { + StorageWrapper* object = new StorageWrapper[]{StorageWrapper()}; + // object[0] = StorageWrapper(); // Remove the source of double free to make the potential leak appear. +} // leak-warning{{Potential leak of memory pointed to by 'object'}} +} // mutually_exclusive_test_case_2 + +namespace mutually_exclusive_test_case_3 { +struct StorageWrapper { + // Imagine the destructor and copy constructor both call a reset() function (among other things). + ~StorageWrapper() { delete parts; } + StorageWrapper(StorageWrapper const&) = default; + + // Mind that there is no `parts = other.parts` assignment -- this is the bug we would like to find. + void operator=(StorageWrapper&& other) { delete parts; } // newdelete-warning{{Attempt to release already released memory}} + + // Not provided, typically would do `parts = new long`. + StorageWrapper(); + + long* parts; +}; + +struct TestDoubleFreeWithInitializerList { + StorageWrapper* Object; + TestDoubleFreeWithInitializerList() + : Object(new StorageWrapper[]{StorageWrapper()}) { + Object[0] = StorageWrapper(); // This assignment leads to the double-free. + } +}; +} // mutually_exclusive_test_case_3 + +} // namespace gh153782 diff --git a/clang/test/Analysis/ctor-trivial-copy.cpp b/clang/test/Analysis/ctor-trivial-copy.cpp index 940ff9ba3ed9c..44990fc631d6d 100644 --- a/clang/test/Analysis/ctor-trivial-copy.cpp +++ b/clang/test/Analysis/ctor-trivial-copy.cpp @@ -5,8 +5,6 @@ void clang_analyzer_printState(); template void clang_analyzer_dump_lref(T& param); template void clang_analyzer_dump_val(T param); -template void clang_analyzer_denote(T param, const char *name); -template void clang_analyzer_express(T param); template T conjure(); template void nop(const Ts &... args) {} @@ -42,10 +40,16 @@ void test_assign_return() { namespace trivial_struct_copy { void _01_empty_structs() { - clang_analyzer_dump_val(conjure()); // expected-warning {{conj_$}} + clang_analyzer_dump_val(conjure()); // expected-warning {{lazyCompoundVal}} empty Empty = conjure(); empty Empty2 = Empty; empty Empty3 = Empty2; + // All of these should refer to the exact same LCV, because all of + // these trivial copies refer to the original conjured value. + // There were Unknown before: + clang_analyzer_dump_val(Empty); // expected-warning {{lazyCompoundVal}} + clang_analyzer_dump_val(Empty2); // expected-warning {{lazyCompoundVal}} + clang_analyzer_dump_val(Empty3); // expected-warning {{lazyCompoundVal}} // We only have binding for the original Empty object, because copying empty // objects is a no-op in the performTrivialCopy. This is fine, because empty @@ -67,20 +71,18 @@ void _01_empty_structs() { } void _02_structs_with_members() { - clang_analyzer_dump_val(conjure()); // expected-warning {{conj_$}} + clang_analyzer_dump_val(conjure()); // expected-warning {{lazyCompoundVal}} aggr Aggr = conjure(); aggr Aggr2 = Aggr; aggr Aggr3 = Aggr2; - // All of these should refer to the exact same symbol, because all of + // All of these should refer to the exact same LCV, because all of // these trivial copies refer to the original conjured value. - clang_analyzer_denote(Aggr, "$Aggr"); - clang_analyzer_express(Aggr); // expected-warning {{$Aggr}} - clang_analyzer_express(Aggr2); // expected-warning {{$Aggr}} - clang_analyzer_express(Aggr3); // expected-warning {{$Aggr}} - - // We should have the same Conjured symbol for "Aggr", "Aggr2" and "Aggr3". 
- // We used to have Derived symbols for the individual fields that were - // copied as part of copying the whole struct. + clang_analyzer_dump_val(Aggr); // expected-warning {{lazyCompoundVal}} + clang_analyzer_dump_val(Aggr2); // expected-warning {{lazyCompoundVal}} + clang_analyzer_dump_val(Aggr3); // expected-warning {{lazyCompoundVal}} + + // We have fields in the struct we copy, thus we also have the entries for the copies + // (and for all of their fields). clang_analyzer_printState(); // CHECK: "store": { "pointer": "0x{{[0-9a-f]+}}", "items": [ // CHECK-NEXT: { "cluster": "GlobalInternalSpaceRegion", "pointer": "0x{{[0-9a-f]+}}", "items": [ @@ -93,10 +95,12 @@ void _02_structs_with_members() { // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ:conj_\$[0-9]+{int, LC[0-9]+, S[0-9]+, #[0-9]+}]]" } // CHECK-NEXT: ]}, // CHECK-NEXT: { "cluster": "Aggr2", "pointer": "0x{{[0-9a-f]+}}", "items": [ - // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ]]" } + // CHECK-NEXT: { "kind": "Direct", "offset": 0, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.x}" }, + // CHECK-NEXT: { "kind": "Direct", "offset": 32, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.y}" } // CHECK-NEXT: ]}, // CHECK-NEXT: { "cluster": "Aggr3", "pointer": "0x{{[0-9a-f]+}}", "items": [ - // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "[[AGGR_CONJ]]" } + // CHECK-NEXT: { "kind": "Direct", "offset": 0, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.x}" }, + // CHECK-NEXT: { "kind": "Direct", "offset": 32, "value": "derived_${{[0-9]+}}{[[AGGR_CONJ]],Aggr.y}" } // CHECK-NEXT: ]} // CHECK-NEXT: ]}, @@ -113,3 +117,31 @@ void entrypoint() { } } // namespace trivial_struct_copy + +namespace gh153782 { + +// Ensure we do not regress on the following use cases. +// The assumption made on a field in `setPtr` should apply to the returned copy in `func`. +struct Status { int error; }; +Status getError(); + +Status setPtr(int **outptr, int* ptr) { + Status e = getError(); + if (e.error != 0) return e; // When assuming the error field is non-zero, + *outptr = ptr; // this is not executed + return e; +} + +int func() { + int *ptr = nullptr; + int x = 42; + if (setPtr(&ptr, &x).error == 0) { + // The assumption made in get() SHOULD match the assumption about + // the returned value, hence the engine SHOULD NOT assume ptr is null. 
+ clang_analyzer_dump_val(ptr); // expected-warning {{&x}} + return *ptr; + } + return 0; +} + +} // namespace gh153782 diff --git a/clang/test/Analysis/explain-svals.cpp b/clang/test/Analysis/explain-svals.cpp index dfc650223c9e7..9474aa7c7dbb1 100644 --- a/clang/test/Analysis/explain-svals.cpp +++ b/clang/test/Analysis/explain-svals.cpp @@ -99,7 +99,7 @@ class C { } // end of anonymous namespace void test_6() { - clang_analyzer_explain(conjure_S()); // expected-warning-re{{{{^symbol of type 'int' conjured at CFG element 'conjure_S\(\) \(CXXRecordTypedCall, \+0\)'$}}}} + clang_analyzer_explain(conjure_S()); // expected-warning-re{{{{^lazily frozen compound value of 1st parameter of function 'clang_analyzer_explain\(\)'$}}}} clang_analyzer_explain(conjure_S().z); // expected-warning-re{{{{^value derived from \(symbol of type 'int' conjured at CFG element 'conjure_S\(\) \(CXXRecordTypedCall, \)'\) for field 'z' of temporary object constructed at statement 'conjure_S\(\)'$}}}} } diff --git a/clang/test/Analysis/iterator-modeling.cpp b/clang/test/Analysis/iterator-modeling.cpp index 78882da4431fd..f1538839d06c8 100644 --- a/clang/test/Analysis/iterator-modeling.cpp +++ b/clang/test/Analysis/iterator-modeling.cpp @@ -2035,7 +2035,6 @@ void print_state(std::vector &V) { // CHECK: "checker_messages": [ // CHECK: { "checker": "alpha.cplusplus.IteratorModeling", "messages": [ // CHECK-NEXT: "Iterator Positions :", - // CHECK-NEXT: "conj_$[[#]]{int, LC[[#]], S[[#]], #[[#]]} : Valid ; Container == SymRegion{reg_$[[#]] & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}", // CHECK-NEXT: "i0 : Valid ; Container == SymRegion{reg_$[[#]] & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}" // CHECK-NEXT: ]} @@ -2046,7 +2045,6 @@ void print_state(std::vector &V) { // CHECK: "checker_messages": [ // CHECK: { "checker": "alpha.cplusplus.IteratorModeling", "messages": [ // CHECK-NEXT: "Iterator Positions :", - // CHECK-NEXT: "conj_$[[#]]{int, LC[[#]], S[[#]], #[[#]]} : Valid ; Container == SymRegion{reg_$[[#]] & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}", // CHECK-NEXT: "i1 : Valid ; Container == SymRegion{reg_$[[#]] & V>} ; Offset == conj_$[[#]]{long, LC[[#]], S[[#]], #[[#]]}" // CHECK-NEXT: ]} diff --git a/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp b/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp index 191af95cd2b9c..98301cf7274fc 100644 --- a/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp +++ b/clang/test/Analysis/stl-algorithm-modeling-aggressive-std-find-modeling.cpp @@ -4,16 +4,6 @@ // RUN: -analyzer-config alpha.cplusplus.STLAlgorithmModeling:AggressiveStdFindModeling=true\ // RUN: -verify -// STLAlgorithmModeling and DebugIteratorModeling are probably bugged because -// these tests started failing after we just directly copy the symbol -// representing the value of a variable instead of creating a LazyCompoundVal -// of that single conjured value. -// In theory, it shouldn't matter if we eagerly copy the value that we would -// "load" from the LCV once requested or just directly binding the backing symbol. -// Yet, these tests fail, so there is likely messed up how/what the checker -// metadata is associated with. 
-// XFAIL: * - #include "Inputs/system-header-simulator-cxx.h" void clang_analyzer_eval(bool); diff --git a/clang/test/Analysis/stl-algorithm-modeling.cpp b/clang/test/Analysis/stl-algorithm-modeling.cpp index f7029c79b0942..5549c24a8c220 100644 --- a/clang/test/Analysis/stl-algorithm-modeling.cpp +++ b/clang/test/Analysis/stl-algorithm-modeling.cpp @@ -3,16 +3,6 @@ // RUN: -analyzer-config aggressive-binary-operation-simplification=true\ // RUN: -verify -// STLAlgorithmModeling and DebugIteratorModeling are probably bugged because -// these tests started failing after we just directly copy the symbol -// representing the value of a variable instead of creating a LazyCompoundVal -// of that single conjured value. -// In theory, it shouldn't matter if we eagerly copy the value that we would -// "load" from the LCV once requested or just directly binding the backing symbol. -// Yet, these tests fail, so there is likely messed up how/what the checker -// metadata is associated with. -// XFAIL: * - #include "Inputs/system-header-simulator-cxx.h" void clang_analyzer_eval(bool); diff --git a/clang/test/Analysis/store-dump-orders.cpp b/clang/test/Analysis/store-dump-orders.cpp index dbe93f1c5183a..d99f581f00fe1 100644 --- a/clang/test/Analysis/store-dump-orders.cpp +++ b/clang/test/Analysis/store-dump-orders.cpp @@ -41,7 +41,7 @@ void test_output(int n) { // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "conj_$ // CHECK-NEXT: ]}, // CHECK-NEXT: { "cluster": "objfirst", "pointer": "0x{{[0-9a-f]+}}", "items": [ - // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "conj_$ + // CHECK-NEXT: { "kind": "Default", "offset": 0, "value": "lazyCompoundVal // CHECK-NEXT: { "kind": "Direct", "offset": 320, "value": "1 S32b" }, // CHECK-NEXT: { "kind": "Direct", "offset": 352, "value": "2 S32b" }, // CHECK-NEXT: { "kind": "Direct", "offset": 384, "value": "3 S32b" } diff --git a/clang/test/Analysis/taint-generic.cpp b/clang/test/Analysis/taint-generic.cpp index fc7c37300d3fc..4b8d9ab68ff84 100644 --- a/clang/test/Analysis/taint-generic.cpp +++ b/clang/test/Analysis/taint-generic.cpp @@ -158,7 +158,11 @@ void top() { clang_analyzer_isTainted(E); // expected-warning {{NO}} Aggr A = mySource1(); - clang_analyzer_isTainted(A); // expected-warning {{YES}} + // FIXME Ideally, both A and A.data should be tainted. However, the + // implementation used by e5ac9145ba29 ([analyzer][taint] Recognize + // tainted LazyCompoundVals (4/4) (#115919), 2024-11-15) led to FPs and + // FNs in various scenarios and had to be reverted to fix #153782. + clang_analyzer_isTainted(A); // expected-warning {{NO}} clang_analyzer_isTainted(A.data); // expected-warning {{YES}} } } // namespace gh114270 diff --git a/clang/test/Analysis/template-param-objects.cpp b/clang/test/Analysis/template-param-objects.cpp index b065f8756d4d8..dde95fa62cb65 100644 --- a/clang/test/Analysis/template-param-objects.cpp +++ b/clang/test/Analysis/template-param-objects.cpp @@ -11,7 +11,7 @@ bool operator ==(Box lhs, Box rhs) { return lhs.value == rhs.value; } template void dumps() { - clang_analyzer_dump(V); // expected-warning {{Unknown}} + clang_analyzer_dump(V); // expected-warning {{lazyCompoundVal}} clang_analyzer_dump(&V); // expected-warning {{Unknown}} clang_analyzer_dump(V.value); // expected-warning {{Unknown}} FIXME: It should be '6 S32b'. 
clang_analyzer_dump(&V.value); // expected-warning {{Unknown}} diff --git a/clang/test/CIR/CodeGen/atomic.c b/clang/test/CIR/CodeGen/atomic.c index 65799881a0cbe..d5bea8446d730 100644 --- a/clang/test/CIR/CodeGen/atomic.c +++ b/clang/test/CIR/CodeGen/atomic.c @@ -46,6 +46,32 @@ void f2(void) { // OGCG-NEXT: store i32 42, ptr %[[SLOT]], align 4 // OGCG: } +void f3(_Atomic(int) *p) { + *p = 42; +} + +// CIR-LABEL: @f3 +// CIR: cir.store align(4) atomic(seq_cst) %{{.+}}, %{{.+}} : !s32i, !cir.ptr + +// LLVM-LABEL: @f3 +// LLVM: store atomic i32 42, ptr %{{.+}} seq_cst, align 4 + +// OGCG-LABEL: @f3 +// OGCG: store atomic i32 42, ptr %{{.+}} seq_cst, align 4 + +void f4(_Atomic(float) *p) { + *p = 3.14; +} + +// CIR-LABEL: @f4 +// CIR: cir.store align(4) atomic(seq_cst) %{{.+}}, %{{.+}} : !cir.float, !cir.ptr + +// LLVM-LABEL: @f4 +// LLVM: store atomic float 0x40091EB860000000, ptr %{{.+}} seq_cst, align 4 + +// OGCG-LABEL: @f4 +// OGCG: store atomic float 0x40091EB860000000, ptr %{{.+}} seq_cst, align 4 + void load(int *ptr) { int x; __atomic_load(ptr, &x, __ATOMIC_RELAXED); diff --git a/clang/test/CIR/CodeGen/binassign.c b/clang/test/CIR/CodeGen/binassign.c index 44c54b4a2969a..4520063c56ee6 100644 --- a/clang/test/CIR/CodeGen/binassign.c +++ b/clang/test/CIR/CodeGen/binassign.c @@ -100,3 +100,107 @@ void binary_assign_struct() { // OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[LS_PTR]], ptr align 4 @gs, i64 8, i1 false) // OGCG: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %[[LSV_PTR]], ptr align 4 @gsv, i64 8, i1 true) // OGCG: ret void + +int ignore_result_assign() { + int arr[10]; + int i, j; + j = i = 123, 0; + j = arr[i = 5]; + int *p, *q = 0; + if(p = q) + return 1; + return 0; +} + +// CIR-LABEL: cir.func{{.*}} @ignore_result_assign() -> !s32i +// CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] +// CIR: %[[ARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["arr"] +// CIR: %[[I:.*]] = cir.alloca !s32i, !cir.ptr, ["i"] +// CIR: %[[J:.*]] = cir.alloca !s32i, !cir.ptr, ["j"] +// CIR: %[[P:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["p"] +// CIR: %[[Q:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["q", init] +// CIR: %[[VAL_123:.*]] = cir.const #cir.int<123> : !s32i +// CIR: cir.store{{.*}} %[[VAL_123]], %[[I]] : !s32i, !cir.ptr +// CIR: cir.store{{.*}} %[[VAL_123]], %[[J]] : !s32i, !cir.ptr +// CIR: %[[VAL_0:.*]] = cir.const #cir.int<0> : !s32i +// CIR: %[[VAL_5:.*]] = cir.const #cir.int<5> : !s32i +// CIR: cir.store{{.*}} %[[VAL_5]], %[[I]] : !s32i, !cir.ptr +// CIR: %[[ARR_DECAY:.*]] = cir.cast array_to_ptrdecay %[[ARR]] : !cir.ptr> -> !cir.ptr +// CIR: %[[ARR_ELEM:.*]] = cir.ptr_stride %[[ARR_DECAY]], %[[VAL_5]] : (!cir.ptr, !s32i) -> !cir.ptr +// CIR: %[[ARR_LOAD:.*]] = cir.load{{.*}} %[[ARR_ELEM]] : !cir.ptr, !s32i +// CIR: cir.store{{.*}} %[[ARR_LOAD]], %[[J]] : !s32i, !cir.ptr +// CIR: %[[NULL:.*]] = cir.const #cir.ptr : !cir.ptr +// CIR: cir.store{{.*}} %[[NULL]], %[[Q]] : !cir.ptr, !cir.ptr> +// CIR: cir.scope { +// CIR: %[[Q_VAL:.*]] = cir.load{{.*}} %[[Q]] : !cir.ptr>, !cir.ptr +// CIR: cir.store{{.*}} %[[Q_VAL]], %[[P]] : !cir.ptr, !cir.ptr> +// CIR: %[[COND:.*]] = cir.cast ptr_to_bool %[[Q_VAL]] : !cir.ptr -> !cir.bool +// CIR: cir.if %[[COND]] { +// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i +// CIR: cir.store %[[ONE]], %[[RETVAL]] : !s32i, !cir.ptr +// CIR: %{{.*}} = cir.load %[[RETVAL]] : !cir.ptr, !s32i +// CIR: cir.return +// CIR: } +// CIR: } +// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s32i +// CIR: cir.store %[[ZERO]], %[[RETVAL]] : !s32i, !cir.ptr 
+// CIR: %{{.*}} = cir.load %[[RETVAL]] : !cir.ptr, !s32i +// CIR: cir.return + +// LLVM-LABEL: define {{.*}}i32 @ignore_result_assign() +// LLVM: %[[RETVAL_PTR:.*]] = alloca i32 +// LLVM: %[[ARR_PTR:.*]] = alloca [10 x i32] +// LLVM: %[[I_PTR:.*]] = alloca i32 +// LLVM: %[[J_PTR:.*]] = alloca i32 +// LLVM: %[[P_PTR:.*]] = alloca ptr +// LLVM: %[[Q_PTR:.*]] = alloca ptr +// LLVM: store i32 123, ptr %[[I_PTR]] +// LLVM: store i32 123, ptr %[[J_PTR]] +// LLVM: store i32 5, ptr %[[I_PTR]] +// LLVM: %[[GEP1:.*]] = getelementptr i32, ptr %[[ARR_PTR]], i32 0 +// LLVM: %[[GEP2:.*]] = getelementptr i32, ptr %[[GEP1]], i64 5 +// LLVM: %[[ARR_VAL:.*]] = load i32, ptr %[[GEP2]] +// LLVM: store i32 %[[ARR_VAL]], ptr %[[J_PTR]] +// LLVM: store ptr null, ptr %[[Q_PTR]] +// LLVM: br label +// LLVM: %[[Q_VAL:.*]] = load ptr, ptr %[[Q_PTR]] +// LLVM: store ptr %[[Q_VAL]], ptr %[[P_PTR]] +// LLVM: %[[CMP:.*]] = icmp ne ptr %[[Q_VAL]], null +// LLVM: br i1 %[[CMP]], label %[[THEN:.*]], label %[[ELSE:.*]] +// LLVM: [[THEN]]: +// LLVM: store i32 1, ptr %[[RETVAL_PTR]] +// LLVM: %{{.*}} = load i32, ptr %[[RETVAL_PTR]] +// LLVM: ret i32 +// LLVM: [[ELSE]]: +// LLVM: br label +// LLVM: store i32 0, ptr %[[RETVAL_PTR]] +// LLVM: %{{.*}} = load i32, ptr %[[RETVAL_PTR]] +// LLVM: ret i32 + +// OGCG-LABEL: define {{.*}}i32 @ignore_result_assign() +// OGCG: %[[RETVAL:.*]] = alloca i32 +// OGCG: %[[ARR:.*]] = alloca [10 x i32] +// OGCG: %[[I:.*]] = alloca i32 +// OGCG: %[[J:.*]] = alloca i32 +// OGCG: %[[P:.*]] = alloca ptr +// OGCG: %[[Q:.*]] = alloca ptr +// OGCG: store i32 123, ptr %[[I]] +// OGCG: store i32 123, ptr %[[J]] +// OGCG: store i32 5, ptr %[[I]] +// OGCG: %[[ARRAYIDX:.*]] = getelementptr inbounds [10 x i32], ptr %[[ARR]], i64 0, i64 5 +// OGCG: %[[ARR_VAL:.*]] = load i32, ptr %[[ARRAYIDX]] +// OGCG: store i32 %[[ARR_VAL]], ptr %[[J]] +// OGCG: store ptr null, ptr %[[Q]] +// OGCG: %[[Q_VAL:.*]] = load ptr, ptr %[[Q]] +// OGCG: store ptr %[[Q_VAL]], ptr %[[P]] +// OGCG: %[[TOBOOL:.*]] = icmp ne ptr %[[Q_VAL]], null +// OGCG: br i1 %[[TOBOOL]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +// OGCG: [[IF_THEN]]: +// OGCG: store i32 1, ptr %[[RETVAL]] +// OGCG: br label %[[RETURN:.*]] +// OGCG: [[IF_END]]: +// OGCG: store i32 0, ptr %[[RETVAL]] +// OGCG: br label %[[RETURN]] +// OGCG: [[RETURN]]: +// OGCG: %{{.*}} = load i32, ptr %[[RETVAL]] +// OGCG: ret i32 diff --git a/clang/test/CIR/CodeGen/builtins-floating-point.c b/clang/test/CIR/CodeGen/builtins-floating-point.c index 193cc172d37d2..1b7de650662c7 100644 --- a/clang/test/CIR/CodeGen/builtins-floating-point.c +++ b/clang/test/CIR/CodeGen/builtins-floating-point.c @@ -7,14 +7,42 @@ float cosf(float f) { return __builtin_cosf(f); - // CHECK: %{{.*}} = cir.cos {{.*}} : !cir.float + // CIR: %{{.*}} = cir.cos %{{.*}} : !cir.float // LLVM: %{{.*}} = call float @llvm.cos.f32(float %{{.*}}) // OGCG: %{{.*}} = call float @llvm.cos.f32(float %{{.*}}) } double cos(double f) { return __builtin_cos(f); - // CIR: {{.+}} = cir.cos {{.+}} : !cir.double + // CIR: %{{.*}} = cir.cos %{{.*}} : !cir.double // LLVM: %{{.*}} = call double @llvm.cos.f64(double %{{.*}}) // OGCG: %{{.*}} = call double @llvm.cos.f64(double %{{.*}}) } + +float ceil(float f) { + return __builtin_ceilf(f); + // CIR: %{{.*}} = cir.ceil %{{.*}} : !cir.float + // LLVM: %{{.*}} = call float @llvm.ceil.f32(float %{{.*}}) + // OGCG: %{{.*}} = call float @llvm.ceil.f32(float %{{.*}}) +} + +float expf(float f) { + return __builtin_expf(f); + // CIR: %{{.*}} = cir.exp {{.*}} : !cir.float + // LLVM: %{{.*}} = 
call float @llvm.exp.f32(float %{{.*}}) + // OGCG: %{{.*}} = call float @llvm.exp.f32(float %{{.*}}) +} + +double exp(double f) { + return __builtin_exp(f); + // CIR: %{{.*}} = cir.exp {{.*}} : !cir.double + // LLVM: %{{.*}} = call double @llvm.exp.f64(double %{{.*}}) + // OGCG: %{{.*}} = call double @llvm.exp.f64(double %{{.*}}) +} + +long double expl(long double f) { + return __builtin_expl(f); + // CIR: %{{.*}} = cir.exp {{.*}} : !cir.long_double + // LLVM: %{{.*}} = call fp128 @llvm.exp.f128(fp128 %{{.*}}) + // OGCG: %{{.*}} = call fp128 @llvm.exp.f128(fp128 %{{.*}}) +} diff --git a/clang/test/CIR/CodeGen/struct.cpp b/clang/test/CIR/CodeGen/struct.cpp index c8db71498e477..ee543001025e7 100644 --- a/clang/test/CIR/CodeGen/struct.cpp +++ b/clang/test/CIR/CodeGen/struct.cpp @@ -344,3 +344,47 @@ void struct_with_const_member_expr() { // OGCG: %[[BF_SET:.*]] = or i8 %[[BF_CLEAR]], 0 // OGCG: store i8 %[[BF_SET]], ptr %[[REF_ADDR]], align 4 // OGCG: store i32 0, ptr %[[A_ADDR]], align 4 + +void function_arg_with_default_value(CompleteS a = {1, 2}) {} + +// CIR: %[[ARG_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["a", init] +// CIR: cir.store %{{.*}}, %[[ARG_ADDR]] : !rec_CompleteS, !cir.ptr + +// LLVM: %[[ARG_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4 +// LLVM: store %struct.CompleteS %{{.*}}, ptr %[[ARG_ADDR]], align 4 + +// OGCG: %[[ARG_ADDR:.*]] = alloca %struct.CompleteS, align 4 +// OGCG: store i64 %{{.*}}, ptr %[[ARG_ADDR]], align 4 + +void calling_function_with_default_values() { + function_arg_with_default_value(); +} + +// CIR: %[[AGG_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["agg.tmp0"] +// CIR: %[[ELEM_0_PTR:.*]] = cir.get_member %[[AGG_ADDR]][0] {name = "a"} : !cir.ptr -> !cir.ptr +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i +// CIR: cir.store{{.*}} %[[CONST_1]], %[[ELEM_0_PTR]] : !s32i, !cir.ptr +// CIR: %[[ELEM_1_PTR:.*]] = cir.get_member %[[AGG_ADDR]][1] {name = "b"} : !cir.ptr -> !cir.ptr +// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i +// CIR: %[[CONST_2_I8:.*]] = cir.cast integral %[[CONST_2]] : !s32i -> !s8i +// CIR: cir.store{{.*}} %[[CONST_2_I8]], %[[ELEM_1_PTR]] : !s8i, !cir.ptr +// CIR: %[[TMP_AGG:.*]] = cir.load{{.*}} %[[AGG_ADDR]] : !cir.ptr, !rec_CompleteS +// CIR: cir.call @_Z31function_arg_with_default_value9CompleteS(%[[TMP_AGG]]) : (!rec_CompleteS) -> () + +// TODO(CIR): the difference between the CIR LLVM and OGCG is because the lack of calling convention lowering, + +// LLVM: %[[AGG_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4 +// LLVM: %[[ELEM_0_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 0 +// LLVM: store i32 1, ptr %[[ELEM_0_PTR]], align 4 +// LLVM: %[[ELEM_1_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 1 +// LLVM: store i8 2, ptr %[[ELEM_1_PTR]], align 4 +// LLVM: %[[TMP_AGG:.*]] = load %struct.CompleteS, ptr %[[AGG_ADDR]], align 4 +// LLVM: call void @_Z31function_arg_with_default_value9CompleteS(%struct.CompleteS %[[TMP_AGG]]) + +// OGCG: %[[AGG_ADDR:.*]] = alloca %struct.CompleteS, align 4 +// OGCG: %[[ELEM_0_PTR:.*]] = getelementptr inbounds nuw %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 0 +// OGCG: store i32 1, ptr %[[ELEM_0_PTR]], align 4 +// OGCG: %[[ELEM_1_PTR:.*]] = getelementptr inbounds nuw %struct.CompleteS, ptr %[[AGG_ADDR]], i32 0, i32 1 +// OGCG: store i8 2, ptr %[[ELEM_1_PTR]], align 4 +// OGCG: %[[TMP_AGG:.*]] = load i64, ptr %[[AGG_ADDR]], align 4 +// OGCG: call void @_Z31function_arg_with_default_value9CompleteS(i64 
%[[TMP_AGG]]) diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c new file mode 100644 index 0000000000000..a0d5845208529 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_16.c @@ -0,0 +1,131 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4(vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2(vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1(vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2(vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4(vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv32f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8(vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f16m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c new file mode 100644 index 0000000000000..25d0991fa70cd --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_32.c @@ -0,0 +1,111 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +xsfvfexp32e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define 
dso_local @test_sf_vfexp_v_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2(vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1(vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2(vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4(vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8(vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_f32m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return 
__riscv_sf_vfexp_v_f32m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c new file mode 100644 index 0000000000000..9fc332a1469ff --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexp_v_bf.c @@ -0,0 +1,135 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1(vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2(vbfloat16m2_t vs2, size_t vl) { + return 
__riscv_sf_vfexp_v_bf16m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4(vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv32bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8(vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_m(vm, vs2, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c new file mode 100644 index 0000000000000..67a9220bd011d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v.c @@ -0,0 +1,234 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexpa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4(vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2(vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1(vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2(vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv16f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4(vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv32f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t 
test_sf_vfexpa_v_f16m8(vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2(vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1(vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2(vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4(vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv16f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8(vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vfloat16m1_t test_sf_vfexpa_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_sf_vfexpa_v_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c new file mode 100644 index 0000000000000..fd6f82db52953 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vfexpa_v_64.c @@ -0,0 +1,90 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f64.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1(vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f64.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2(vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f64.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4(vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f64.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8(vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_m(vbool64_t vm, vfloat64m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: 
[[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_m(vbool32_t vm, vfloat64m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_m(vbool16_t vm, vfloat64m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_m(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_m(vbool8_t vm, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_m(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c new file mode 100644 index 0000000000000..0e769ed5fc5bc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_16.c @@ -0,0 +1,131 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4(vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2(vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1(vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8f16.i64( poison, [[VS2]], i64 [[VL]]) 
+// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2(vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4(vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv32f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8(vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c new file mode 100644 index 0000000000000..3df1eaa3a0467 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_32.c @@ -0,0 +1,111 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +xsfvfexp32e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2(vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1(vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2(vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4(vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8(vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_m(vbool64_t 
vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_m(vbool16_t vm, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c new file mode 100644 index 0000000000000..6179dbe8d82e4 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexp_v_bf.c @@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4(vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t 
test_sf_vfexp_v_bf16mf2(vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1(vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2(vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4(vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv32bf16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8(vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_m(vbool64_t vm, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_m(vbool32_t vm, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_m(vbool16_t vm, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) 
+// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_m(vbool8_t vm, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_m(vbool4_t vm, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_m(vbool2_t vm, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c new file mode 100644 index 0000000000000..1ddbb0b84520c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v.c @@ -0,0 +1,234 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexpa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4(vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2(vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1(vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2(vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv16f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4(vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv32f16.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8(vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2(vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1(vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2(vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4(vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv16f32.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8(vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_m(vbool64_t vm, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { 
+// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_m(vbool32_t vm, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_m(vbool16_t vm, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_m(vbool8_t vm, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_m(vbool4_t vm, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_m(vbool2_t vm, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_m(vbool64_t vm, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_m(vbool32_t vm, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_m(vbool16_t vm, 
vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_m(vbool8_t vm, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_m(vbool4_t vm, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c new file mode 100644 index 0000000000000..165879a8bb589 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vfexpa_v_64.c @@ -0,0 +1,90 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f64.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1(vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f64.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2(vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f64.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4(vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8( +// CHECK-RV64-SAME: [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f64.i64( poison, [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8(vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_m(vbool64_t vm, vfloat64m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_m(vbool32_t vm, vfloat64m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_m(vbool16_t vm, vfloat64m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_m( +// CHECK-RV64-SAME: [[VM:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64( poison, [[VS2]], [[VM]], i64 [[VL]], i64 3) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_m(vbool8_t vm, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa(vm, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c new file mode 100644 index 0000000000000..aed6d87a4b18a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_16.c @@ -0,0 +1,248 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], 
[[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv32f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] 
+// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_tumu( +// 
CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return 
__riscv_sf_vfexp_v_f16m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f16m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c new file mode 100644 index 0000000000000..374f324cc0808 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_32.c @@ -0,0 +1,208 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +xsfvfexp32e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vfloat32m8_t test_sf_vfexp_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.sf.vfexp.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 
noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_f32m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c new file mode 100644 index 0000000000000..aec0b9f934ab9 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexp_v_bf.c @@ -0,0 +1,248 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t 
vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t 
test_sf_vfexp_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_v_bf16m8_mu(vm, vd, vs2, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c new file mode 100644 index 0000000000000..b6870264251cc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v.c @@ -0,0 +1,448 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexpa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv16f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv32f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_tu(vd, vs2, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv16f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
@test_sf_vfexpa_v_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) 
{ + return __riscv_sf_vfexpa_v_f16m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] 
+// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f16m8_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32mf2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64( 
[[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f32m8_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c new file mode 100644 index 0000000000000..8638dc232cf01 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/non-overloaded/sf_vfexpa_v_64.c @@ -0,0 +1,167 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f64.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tu(vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f64.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tu(vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], 
i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f64.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tu(vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f64.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tu(vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tum(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tum(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tum(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tum(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tumu(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tumu(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tumu(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tumu(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_mu(vbool64_t vm, vfloat64m1_t vd, vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m1_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_mu(vbool32_t vm, vfloat64m2_t vd, vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m2_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_mu(vbool16_t vm, vfloat64m4_t vd, vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m4_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_mu(vbool8_t vm, vfloat64m8_t vd, vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_v_f64m8_mu(vm, vd, vs2, vl); +} diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c new file mode 100644 index 0000000000000..4ceeb7b35629c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_16.c @@ -0,0 +1,261 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv32f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return 
__riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t 
test_sf_vfexp_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexp_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexp_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f16.i64( [[VD]], [[VS2]], 
[[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexp_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexp_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexp_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexp_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c new file mode 100644 index 0000000000000..e08d6c5b371cc --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_32.c @@ -0,0 +1,228 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +xsfvfexp32e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], 
[[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tum(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
[[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexp_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], 
[[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexp_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexp_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexp_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexp_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c new file mode 100644 index 0000000000000..14570d465bea8 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexp_v_bf.c @@ -0,0 +1,272 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64x -target-feature +zve32f \ +// RUN: -target-feature +zvfbfmin -target-feature +xsfvfbfexp16e -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv1bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tu(vbfloat16mf4_t vd, vbfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv2bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tu(vbfloat16mf2_t vd, vbfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv4bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tu(vbfloat16m1_t vd, vbfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv8bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tu(vbfloat16m2_t vd, vbfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv16bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tu(vbfloat16m4_t vd, vbfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.nxv32bf16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tu(vbfloat16m8_t vd, vbfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexp_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tum(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tum(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tum(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return 
__riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tum(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tum(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tum(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_tumu(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_tumu(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_tumu(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] 
+// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_tumu(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_tumu(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_tumu(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv1bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf4_t test_sf_vfexp_v_bf16mf4_mu(vbool64_t vm, vbfloat16mf4_t vd, + vbfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv2bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16mf2_t test_sf_vfexp_v_bf16mf2_mu(vbool32_t vm, vbfloat16mf2_t vd, + vbfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv4bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m1_t test_sf_vfexp_v_bf16m1_mu(vbool16_t vm, vbfloat16m1_t vd, + vbfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv8bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m2_t test_sf_vfexp_v_bf16m2_mu(vbool8_t vm, vbfloat16m2_t vd, + vbfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.sf.vfexp.mask.nxv16bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m4_t test_sf_vfexp_v_bf16m4_mu(vbool4_t vm, vbfloat16m4_t vd, + vbfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexp_v_bf16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: [[ENTRY:.*:]] +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexp.mask.nxv32bf16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vfexp_v_bf16m8_mu(vbool2_t vm, vbfloat16m8_t vd, + vbfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexp_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c new file mode 100644 index 0000000000000..4ac5cfc360551 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v.c @@ -0,0 +1,492 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +zve64f -target-feature +zvfh \ +// RUN: -target-feature +xsfvfexpa -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tu(vfloat16mf4_t vd, vfloat16mf4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tu(vfloat16mf2_t vd, vfloat16mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tu(vfloat16m1_t vd, vfloat16m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tu(vfloat16m2_t vd, vfloat16m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv16f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tu(vfloat16m4_t vd, vfloat16m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv32f16.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tu(vfloat16m8_t vd, vfloat16m8_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tu(vfloat32mf2_t vd, vfloat32mf2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tu(vfloat32m1_t vd, vfloat32m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tu(vfloat32m2_t vd, vfloat32m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tu(vfloat32m4_t vd, vfloat32m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv16f32.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tu(vfloat32m8_t vd, vfloat32m8_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tum(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return 
__riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tum(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tum(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_tum(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tum(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tum(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tum(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tum(vbool32_t vm, vfloat32m1_t 
vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tum(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tum(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tum(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_tumu(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_tumu(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_tumu(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// 
+vfloat16m2_t test_sf_vfexpa_v_f16m2_tumu(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_tumu(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_tumu(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_tumu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_tumu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_tumu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_tumu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64( [[VD]], [[VS2]], 
[[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_tumu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf4_t test_sf_vfexpa_v_f16mf4_mu(vbool64_t vm, vfloat16mf4_t vd, + vfloat16mf4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16mf2_t test_sf_vfexpa_v_f16mf2_mu(vbool32_t vm, vfloat16mf2_t vd, + vfloat16mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m1_t test_sf_vfexpa_v_f16m1_mu(vbool16_t vm, vfloat16m1_t vd, + vfloat16m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m2_t test_sf_vfexpa_v_f16m2_mu(vbool8_t vm, vfloat16m2_t vd, + vfloat16m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m4_t test_sf_vfexpa_v_f16m4_mu(vbool4_t vm, vfloat16m4_t vd, + vfloat16m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f16m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv32f16.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vfexpa_v_f16m8_mu(vbool2_t vm, vfloat16m8_t vd, + vfloat16m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32mf2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.sf.vfexpa.mask.nxv1f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32mf2_t test_sf_vfexpa_v_f32mf2_mu(vbool64_t vm, vfloat32mf2_t vd, + vfloat32mf2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m1_t test_sf_vfexpa_v_f32m1_mu(vbool32_t vm, vfloat32m1_t vd, + vfloat32m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m2_t test_sf_vfexpa_v_f32m2_mu(vbool16_t vm, vfloat32m2_t vd, + vfloat32m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m4_t test_sf_vfexpa_v_f32m4_mu(vbool8_t vm, vfloat32m4_t vd, + vfloat32m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f32m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv16f32.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vfexpa_v_f32m8_mu(vbool4_t vm, vfloat32m8_t vd, + vfloat32m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c new file mode 100644 index 0000000000000..d0faaee571122 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/policy/overloaded/sf_vfexpa_v_64.c @@ -0,0 +1,183 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfvfexpa64e \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | opt -S -passes=mem2reg | \ +// RUN: FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv1f64.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tu(vfloat64m1_t vd, vfloat64m1_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_tu( 
+// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv2f64.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tu(vfloat64m2_t vd, vfloat64m2_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv4f64.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tu(vfloat64m4_t vd, vfloat64m4_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_tu( +// CHECK-RV64-SAME: [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.nxv8f64.i64( [[VD]], [[VS2]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tu(vfloat64m8_t vd, vfloat64m8_t vs2, + size_t vl) { + return __riscv_sf_vfexpa_tu(vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tum(vbool64_t vm, vfloat64m1_t vd, + vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tum(vbool32_t vm, vfloat64m2_t vd, + vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tum(vbool16_t vm, vfloat64m4_t vd, + vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_tum( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 2) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tum(vbool8_t vm, vfloat64m8_t vd, + vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tum(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: 
entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_tumu(vbool64_t vm, vfloat64m1_t vd, + vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_tumu(vbool32_t vm, vfloat64m2_t vd, + vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_tumu(vbool16_t vm, vfloat64m4_t vd, + vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_tumu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 0) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_tumu(vbool8_t vm, vfloat64m8_t vd, + vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_tumu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m1_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv1f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m1_t test_sf_vfexpa_v_f64m1_mu(vbool64_t vm, vfloat64m1_t vd, + vfloat64m1_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m2_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv2f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m2_t test_sf_vfexpa_v_f64m2_mu(vbool32_t vm, vfloat64m2_t vd, + vfloat64m2_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m4_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv4f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m4_t test_sf_vfexpa_v_f64m4_mu(vbool16_t vm, vfloat64m4_t vd, + vfloat64m4_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vfexpa_v_f64m8_mu( +// CHECK-RV64-SAME: [[VM:%.*]], [[VD:%.*]], [[VS2:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vfexpa.mask.nxv8f64.i64( [[VD]], [[VS2]], [[VM]], i64 [[VL]], i64 1) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vfexpa_v_f64m8_mu(vbool8_t vm, vfloat64m8_t vd, + vfloat64m8_t vs2, size_t vl) { + return __riscv_sf_vfexpa_mu(vm, vd, vs2, vl); +} diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index be2cd480f7558..e6e2e38bcc097 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -1591,6 +1591,132 @@ __m512i test_mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i return _mm512_maskz_permutex2var_epi16(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v32hi( + _mm512_permutex2var_epi16( + (__m512i)(__v32hi){ + 0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150, + 160, 170, 180, 190, 200, 210, 220, 230, + 240, 250, 260, 270, 280, 290, 300, 310}, + (__m512i)(__v32hi){ + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m512i)(__v32hi){ + 400, 410, 420, 430, 440, 450, 460, 470, + 480, 490, 500, 510, 520, 530, 540, 550, + 560, 570, 580, 590, 600, 610, 620, 630, + 640, 650, 660, 670, 680, 690, 700, 710}), + 0, 400, 10, 410, 20, 420, 30, 430, + 40, 440, 50, 450, 60, 460, 70, 470, + 80, 480, 90, 490, 100, 500, 110, 510, + 120, 520, 130, 530, 140, 540, 150, 550)); +TEST_CONSTEXPR(match_v32hi( + _mm512_mask_permutex2var_epi16( + (__m512i)(__v32hi){ + -1, -2, -3, -4, -5, -6, -7, -8, + -9, -10, -11, -12, -13, -14, -15, -16, + -17, -18, -19, -20, -21, -22, -23, -24, + -25, -26, -27, -28, -29, -30, -31, -32}, + 0xAAAAAAAA, + (__m512i)(__v32hi){ + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m512i)(__v32hi){ + 400, 410, 420, 430, 440, 450, 460, 470, + 480, 490, 500, 510, 520, 530, 540, 550, + 560, 570, 580, 590, 600, 610, 620, 630, + 640, 650, 660, 670, 680, 690, 700, 710}), + -1, 400, -3, 410, -5, 420, -7, 430, + -9, 440, -11, 450, -13, 460, -15, 470, + -17, 480, -19, 490, -21, 500, -23, 510, + -25, 520, -27, 530, -29, 540, -31, 550)); +TEST_CONSTEXPR(match_v32hi( + _mm512_maskz_permutex2var_epi16( + 0x55555555, + (__m512i)(__v32hi){ + 0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150, + 160, 170, 180, 190, 200, 210, 220, 230, + 240, 250, 260, 270, 280, 290, 300, 310}, + (__m512i)(__v32hi){ + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m512i)(__v32hi){ + 400, 410, 420, 430, 440, 450, 460, 470, + 480, 490, 500, 510, 520, 530, 540, 550, + 560, 570, 580, 590, 600, 610, 620, 630, + 640, 650, 660, 670, 680, 690, 700, 710}), + 0, 0, 10, 0, 20, 0, 30, 0, + 40, 0, 50, 0, 60, 0, 70, 0, + 80, 0, 90, 0, 100, 0, 110, 0, + 120, 0, 130, 0, 140, 0, 150, 0)); + +TEST_CONSTEXPR(match_v64qu( + _mm512_permutex2var_epi8( + (__m512i)(__v64qu){ + 0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 127, 126, 125, + 124, 123, 122, 121, 120, 119, 118, 117, + 116, 115, 114, 113, 112, 111, 110, 109, + 108, 107, 106, 105, 104, 103, 102, 101, + 100, 99, 98, 97, 96, 95, 94, 93, + 92, 91, 90, 89, 88, 87, 86, 85, + 84, 83, 82, 81, 80, 79, 78, 77}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 
18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 210, 220, 230, 240, 250, 254, 253, + 252, 251, 250, 249, 248, 247, 246, 245, + 244, 243, 242, 241, 240, 239, 238, 237, + 236, 235, 234, 233, 232, 231, 230, 229, + 228, 227, 226, 225, 224, 223, 222, 221, + 220, 219, 218, 217, 216, 215, 214, 213, + 212, 211, 210, 209, 208, 207, 206, 205, + 204, 203, 202, 201, 200, 199, 198, 197}), + 0, 200, 10, 210, 20, 220, 30, 230, + 40, 240, 50, 250, 60, 254, 70, 253, + 80, 252, 90, 251, 100, 250, 110, 249, + 120, 248, 127, 247, 126, 246, 125, 245, + 124, 244, 123, 243, 122, 242, 121, 241, + 120, 240, 119, 239, 118, 238, 117, 237, + 116, 236, 115, 235, 114, 234, 113, 233, + 112, 232, 111, 231, 110, 230, 109, 229)); +TEST_CONSTEXPR(match_v32hi( + _mm512_mask2_permutex2var_epi16( + (__m512i)(__v32hi){ + 0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150, + 160, 170, 180, 190, 200, 210, 220, 230, + 240, 250, 260, 270, 280, 290, 300, 310}, + (__m512i)(__v32hi){ + 0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + 0x55555555, + (__m512i)(__v32hi){ + 400, 410, 420, 430, 440, 450, 460, 470, + 480, 490, 500, 510, 520, 530, 540, 550, + 560, 570, 580, 590, 600, 610, 620, 630, + 640, 650, 660, 670, 680, 690, 700, 710}), + 0, 32, 10, 33, 20, 34, 30, 35, + 40, 36, 50, 37, 60, 38, 70, 39, + 80, 40, 90, 41, 100, 42, 110, 43, + 120, 44, 130, 45, 140, 46, 150, 47)); + __m512i test_mm512_mulhrs_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mulhrs_epi16 // CHECK: @llvm.x86.avx512.pmul.hr.sw.512 @@ -2578,6 +2704,33 @@ __m512i test_mm512_broadcastw_epi16(__m128i __A) { return _mm512_broadcastw_epi16(__A); } TEST_CONSTEXPR(match_v32hi(_mm512_broadcastw_epi16((__m128i)(__v8hi){42, 3, 10, 8, 0, 256, 256, 128}), 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42)); +TEST_CONSTEXPR(match_v32hi( + _mm512_permutex2var_epi16((__m512i)(__v32hi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}, + (__m512i)(__v32hi){0, 31, 32, 63, 1, 33, 2, 34, + 3, 35, 4, 36, 5, 37, 6, 38, + 7, 39, 8, 40, 9, 41, 10, 42, + 11, 43, 12, 44, 13, 45, 14, 46}, + (__m512i)(__v32hi){101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132}), + 1, 32, 101, 132, 2, 102, 3, 103, + 4, 104, 5, 105, 6, 106, 7, 107, + 8, 108, 9, 109, 10, 110, 11, 111, + 12, 112, 13, 113, 14, 114, 15, 115)); +TEST_CONSTEXPR(match_v32hi( + _mm512_mask_permutex2var_epi16((__m512i)(__v32hi){-1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16, + -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, -30, -31, -32}, + 0xAAAAAAAA, + (__m512i)(__v32hi){0, 31, 32, 63, 1, 33, 2, 34, + 3, 35, 4, 36, 5, 37, 6, 38, + 7, 39, 8, 40, 9, 41, 10, 42, + 11, 43, 12, 44, 13, 45, 14, 46}, + (__m512i)(__v32hi){101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132}), + -1, -32, -3, 132, -5, 102, -7, 103, + -9, 104, -11, 105, -13, 106, -15, 107, + -17, 108, -19, 109, -21, 110, -23, 111, + -25, 112, -27, 113, -29, 114, -31, 115)); __m512i test_mm512_mask_broadcastw_epi16(__m512i __O, __mmask32 __M, __m128i __A) { // 
CHECK-LABEL: test_mm512_mask_broadcastw_epi16 diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index 69599379b6b3d..8e65430bd3e84 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -5607,6 +5607,56 @@ __m512i test_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i _ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_permutex2var_epi64(__U, __A, __I, __B); } + +TEST_CONSTEXPR(match_v16si( + _mm512_permutex2var_epi32( + (__m512i)(__v16si){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + 0, 150, 200, 350, 10, 210, 20, 220, + 30, 230, 40, 240, 50, 250, 60, 260)); +TEST_CONSTEXPR(match_v16si( + _mm512_mask_permutex2var_epi32( + (__m512i)(__v16si){-1, -2, -3, -4, -5, -6, -7, -8, + -9, -10, -11, -12, -13, -14, -15, -16}, + 0xAAAA, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + -1, -16, -3, 350, -5, 210, -7, 220, + -9, 230, -11, 240, -13, 250, -15, 260)); +TEST_CONSTEXPR(match_v16si( + _mm512_maskz_permutex2var_epi32( + 0x5555, + (__m512i)(__v16si){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + 0, 0, 200, 0, 10, 0, 20, 0, + 30, 0, 40, 0, 50, 0, 60, 0)); +TEST_CONSTEXPR(match_m512( + _mm512_permutex2var_ps( + (__m512){1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, + 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f, + 109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}), + 1.f, 16.f, 101.f, 116.f, 2.f, 102.f, 3.f, 103.f, + 4.f, 104.f, 5.f, 105.f, 6.f, 106.f, 7.f, 107.f)); +TEST_CONSTEXPR(match_m512d( + _mm512_permutex2var_pd( + (__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}), + 1.0, 108.0, 1.0, 8.0, 2.0, 2.0, 3.0, 3.0)); + __mmask16 test_mm512_testn_epi32_mask(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_testn_epi32_mask // CHECK: and <16 x i32> %{{.*}}, %{{.*}} @@ -11753,3 +11803,73 @@ void test_mm512_mask_i32loscatter_epi64(void *__addr, __mmask8 __mask, __m512i _ // CHECK: @llvm.x86.avx512.mask.scatter.dpq.512 _mm512_mask_i32loscatter_epi64(__addr, __mask, __index, __v1, 2); } + + +TEST_CONSTEXPR(match_m512d( + _mm512_permutex2var_pd((__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}), + 1.0, 108.0, 1.0, 8.0, 2.0, 2.0, 3.0, 3.0)); +TEST_CONSTEXPR(match_m512d( + _mm512_mask_permutex2var_pd((__m512d){-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0}, + 0xAA, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}), + -1.0, 108.0, -3.0, -8.0, -5.0, -2.0, -7.0, -3.0)); +TEST_CONSTEXPR(match_m512d( + _mm512_maskz_permutex2var_pd(0x55, 
(__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512d){101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0}), + 1.0, 0.0, 1.0, 0.0, 2.0, 0.0, 3.0, 0.0)); + +TEST_CONSTEXPR(match_m512( + _mm512_permutex2var_ps((__m512){1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, + 9.f, 10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f, + 109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}), + 1.f, 16.f, 101.f, 116.f, 2.f, 102.f, 3.f, 103.f, + 4.f, 104.f, 5.f, 105.f, 6.f, 106.f, 7.f, 107.f)); +TEST_CONSTEXPR(match_m512( + _mm512_mask_permutex2var_ps((__m512){-1.f, -2.f, -3.f, -4.f, -5.f, -6.f, -7.f, -8.f, + -9.f, -10.f, -11.f, -12.f, -13.f, -14.f, -15.f, -16.f}, + 0xAAAA, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512){101.f, 102.f, 103.f, 104.f, 105.f, 106.f, 107.f, 108.f, + 109.f, 110.f, 111.f, 112.f, 113.f, 114.f, 115.f, 116.f}), + -1.f, -16.f, -3.f, 116.f, -5.f, 102.f, -7.f, 103.f, + -9.f, 104.f, -11.f, 105.f, -13.f, 106.f, -15.f, 107.f)); + +TEST_CONSTEXPR(match_v16si( + _mm512_permutex2var_epi32((__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 1, 16, 101, 116, 2, 102, 3, 103, + 4, 104, 5, 105, 6, 106, 7, 107)); +TEST_CONSTEXPR(match_v16si( + _mm512_maskz_permutex2var_epi32(0x5555, + (__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}, + (__m512i)(__v16si){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m512i)(__v16si){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 1, 0, 101, 0, 2, 0, 3, 0, + 4, 0, 5, 0, 6, 0, 7, 0)); + +TEST_CONSTEXPR(match_v8di( + _mm512_permutex2var_epi64((__m512i)(__v8di){1, 2, 3, 4, 5, 6, 7, 8}, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512i)(__v8di){101, 102, 103, 104, 105, 106, 107, 108}), + 1, 108, 1, 8, 2, 2, 3, 3)); +TEST_CONSTEXPR(match_v8di( + _mm512_mask_permutex2var_epi64((__m512i)(__v8di){-1, -2, -3, -4, -5, -6, -7, -8}, + 0xAA, + (__m512i)(__v8di){0, 15, 16, 23, 1, 17, 2, 18}, + (__m512i)(__v8di){101, 102, 103, 104, 105, 106, 107, 108}), + -1, 108, -3, -8, -5, -2, -7, -3)); diff --git a/clang/test/CodeGen/X86/avx512vbmi-builtins.c b/clang/test/CodeGen/X86/avx512vbmi-builtins.c index c3b6298a39b59..7d506db92faeb 100644 --- a/clang/test/CodeGen/X86/avx512vbmi-builtins.c +++ b/clang/test/CodeGen/X86/avx512vbmi-builtins.c @@ -3,8 +3,14 @@ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ 
-flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s + #include +#include "builtin_test_helpers.h" __m512i test_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, __m512i __B) { // CHECK-LABEL: test_mm512_mask2_permutex2var_epi8 @@ -33,6 +39,154 @@ __m512i test_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i _ return _mm512_maskz_permutex2var_epi8(__U, __A, __I, __B); } +TEST_CONSTEXPR(match_v64qu( + _mm512_permutex2var_epi8((__m512i)(__v64qu){ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 0, 200, 1, 201, 2, 202, 3, 203, + 4, 204, 5, 205, 6, 206, 7, 207, + 8, 208, 9, 209, 10, 210, 11, 211, + 12, 212, 13, 213, 14, 214, 15, 215, + 16, 216, 17, 217, 18, 218, 19, 219, + 20, 220, 21, 221, 22, 222, 23, 223, + 24, 224, 25, 225, 26, 226, 27, 227, + 28, 228, 29, 229, 30, 230, 31, 231)); +TEST_CONSTEXPR(match_v64qu( + _mm512_mask_permutex2var_epi8((__m512i)(__v64qu){ + 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 29, 30, 31, 32, 33, + 34, 35, 36, 37, 38, 39, 40, 41, + 42, 43, 44, 45, 46, 47, 48, 49, + 50, 51, 52, 53, 54, 55, 56, 57, + 58, 59, 60, 61, 62, 63, 64, 65, + 66, 67, 68, 69, 70, 71, 72, 73}, + 0xAAAAAAAAAAAAAAAAULL, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 10, 200, 12, 201, 14, 202, 16, 203, + 18, 204, 20, 205, 22, 206, 24, 207, + 26, 208, 28, 209, 30, 210, 32, 211, + 34, 212, 36, 213, 38, 214, 40, 215, + 42, 216, 44, 217, 46, 218, 48, 219, + 50, 220, 52, 221, 54, 222, 56, 223, + 58, 224, 60, 225, 62, 226, 64, 227, + 66, 228, 68, 229, 70, 230, 72, 231)); +TEST_CONSTEXPR(match_v64qu( + _mm512_maskz_permutex2var_epi8(0x5555555555555555ULL, + (__m512i)(__v64qu){ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 
27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 0, 0, 1, 0, 2, 0, 3, 0, + 4, 0, 5, 0, 6, 0, 7, 0, + 8, 0, 9, 0, 10, 0, 11, 0, + 12, 0, 13, 0, 14, 0, 15, 0, + 16, 0, 17, 0, 18, 0, 19, 0, + 20, 0, 21, 0, 22, 0, 23, 0, + 24, 0, 25, 0, 26, 0, 27, 0, + 28, 0, 29, 0, 30, 0, 31, 0)); +TEST_CONSTEXPR(match_v64qu( + _mm512_mask2_permutex2var_epi8((__m512i)(__v64qu){ + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63}, + (__m512i)(__v64qu){ + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95}, + 0x5555555555555555ULL, + (__m512i)(__v64qu){ + 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, + 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 254, 255, + 0, 1, 2, 3, 4, 5, 6, 7}), + 0, 64, 1, 65, 2, 66, 3, 67, + 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, + 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, + 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, + 28, 92, 29, 93, 30, 94, 31, 95)); + __m512i test_mm512_permutexvar_epi8(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_permutexvar_epi8 // CHECK: call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %{{.*}}, <64 x i8> %{{.*}}) diff --git a/clang/test/CodeGen/X86/avx512vbmivl-builtin.c b/clang/test/CodeGen/X86/avx512vbmivl-builtin.c index c4d5fc8fb6977..49b7a1a721195 100644 --- a/clang/test/CodeGen/X86/avx512vbmivl-builtin.c +++ b/clang/test/CodeGen/X86/avx512vbmivl-builtin.c @@ -3,8 +3,14 @@ // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s // RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s 
-triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=i386-apple-darwin -target-feature +avx512vbmi -target-feature +avx512vl -target-feature +avx512bw -emit-llvm -o - -Wall -Werror -fexperimental-new-constant-interpreter | FileCheck %s + #include +#include "builtin_test_helpers.h" __m128i test_mm_permutexvar_epi8(__m128i __A, __m128i __B) { // CHECK-LABEL: test_mm_permutexvar_epi8 @@ -77,8 +83,28 @@ __m128i test_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, // CHECK-LABEL: test_mm_maskz_permutex2var_epi8 // CHECK: call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} - return _mm_maskz_permutex2var_epi8(__U, __A, __I, __B); -} + return _mm_maskz_permutex2var_epi8(__U, __A, __I, __B); +} + +TEST_CONSTEXPR(match_v16qu( + _mm_permutex2var_epi8((__m128i)(__v16qu){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16}, + (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23}, + (__m128i)(__v16qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 1, 101, 2, 102, 3, 103, 4, 104, + 5, 105, 6, 106, 7, 107, 8, 108)); +TEST_CONSTEXPR(match_v16qu( + _mm_mask_permutex2var_epi8((__m128i)(__v16qu){200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215}, + 0xAAAA, + (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23}, + (__m128i)(__v16qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116}), + 200, 101, 202, 102, 204, 103, 206, 104, + 208, 105, 210, 106, 212, 107, 214, 108)); __m256i test_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_permutex2var_epi8 @@ -97,8 +123,44 @@ __m256i test_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i _ // CHECK-LABEL: test_mm256_maskz_permutex2var_epi8 // CHECK: call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} - return _mm256_maskz_permutex2var_epi8(__U, __A, __I, __B); -} + return _mm256_maskz_permutex2var_epi8(__U, __A, __I, __B); +} + +TEST_CONSTEXPR(match_v32qu( + _mm256_permutex2var_epi8((__m256i)(__v32qu){1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32}, + (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m256i)(__v32qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, + 125, 126, 127, 128, 129, 130, 131, 132}), + 1, 101, 2, 102, 3, 103, 4, 104, + 5, 105, 6, 106, 7, 107, 8, 108, + 9, 109, 10, 110, 11, 111, 12, 112, + 13, 113, 14, 114, 15, 115, 16, 116)); +TEST_CONSTEXPR(match_v32qu( + _mm256_mask_permutex2var_epi8((__m256i)(__v32qu){200, 201, 202, 203, 204, 205, 206, 
207, + 208, 209, 210, 211, 212, 213, 214, 215, + 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231}, + 0xAAAAAAAA, + (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35, + 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, + 12, 44, 13, 45, 14, 46, 15, 47}, + (__m256i)(__v32qu){101, 102, 103, 104, 105, 106, 107, 108, + 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, + 125, 126, 127, 128, 129, 130, 131, 132}), + 200, 101, 202, 102, 204, 103, 206, 104, + 208, 105, 210, 106, 212, 107, 214, 108, + 216, 109, 218, 110, 220, 111, 222, 112, + 224, 113, 226, 114, 228, 115, 230, 116)); __m128i test_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y) { // CHECK-LABEL: test_mm_mask_multishift_epi64_epi8 diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 33c43977f72dc..121d5bf8d4adb 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -5610,12 +5610,23 @@ __m128i test_mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask2_permutex2var_epi32(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_mask2_permutex2var_epi32((__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, 0x05, + (__m128i)(__v4si){100, 200, 300, 400}), + 10, 3, 100, 6)); __m256i test_mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { // CHECK-LABEL: test_mm256_mask2_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask2_permutex2var_epi32(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_mask2_permutex2var_epi32((__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + 0xA5, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + 0, 7, 100, 15, 1, 110, 2, 120)); __m128d test_mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) { // CHECK-LABEL: test_mm_mask2_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 @@ -5646,149 +5657,255 @@ __m128i test_mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U, // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask2_permutex2var_epi64(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_mask2_permutex2var_epi64((__m128i)(__v2di){10, 20}, + (__m128i)(__v2di){0, 5}, 0x1, + (__m128i)(__v2di){100, 200}), + 10, 5)); __m256i test_mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U, __m256i __B) { // CHECK-LABEL: test_mm256_mask2_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask2_permutex2var_epi64(__A,__I,__U,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_mask2_permutex2var_epi64((__m256i)(__v4di){0, 10, 20, 30}, + (__m256i)(__v4di){0, 1, 4, 5}, 0x5, + (__m256i)(__v4di){100, 110, 120, 130}), + 0, 1, 100, 5)); __m128i test_mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.128 return _mm_permutex2var_epi32(__A,__I,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_permutex2var_epi32((__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + 10, 40, 100, 300)); 
__m128i test_mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_mask_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.128 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_permutex2var_epi32(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_mask_permutex2var_epi32((__m128i)(__v4si){-1, -2, -3, -4}, 0x0A, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + -1, -4, -3, 300)); __m128i test_mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.128 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_permutex2var_epi32(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v4si( + _mm_maskz_permutex2var_epi32(0x0A, (__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + 0, 40, 0, 300)); __m256i test_mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 return _mm256_permutex2var_epi32(__A,__I,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_permutex2var_epi32((__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + 0, 70, 100, 170, 10, 110, 20, 120)); __m256i test_mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_permutex2var_epi32(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_mask_permutex2var_epi32((__m256i)(__v8si){-1, -2, -3, -4, -5, -6, -7, -8}, 0xAA, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + -1, -8, -3, 170, -5, 110, -7, 120)); __m256i test_mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_epi32 // CHECK: @llvm.x86.avx512.vpermi2var.d.256 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_permutex2var_epi32(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v8si( + _mm256_maskz_permutex2var_epi32(0xAA, (__m256i)(__v8si){0, 10, 20, 30, 40, 50, 60, 70}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){100, 110, 120, 130, 140, 150, 160, 170}), + 0, 70, 0, 170, 0, 110, 0, 120)); __m128d test_mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) { // CHECK-LABEL: test_mm_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 return _mm_permutex2var_pd(__A,__I,__B); } +TEST_CONSTEXPR(match_m128d( + _mm_permutex2var_pd((__m128d){1.0, 2.0}, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}), + 1.0, 10.0)); __m128d test_mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) { // CHECK-LABEL: test_mm_mask_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_permutex2var_pd(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m128d( + _mm_mask_permutex2var_pd((__m128d){-1.0, -2.0}, 0x2, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}), + -1.0, 10.0)); __m128d test_mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d 
__B) { // CHECK-LABEL: test_mm_maskz_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.128 // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_permutex2var_pd(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m128d( + _mm_maskz_permutex2var_pd(0x2, (__m128d){1.0, 2.0}, (__m128i)(__v2di){0, 2}, (__m128d){10.0, 20.0}), + 0.0, 10.0)); __m256d test_mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) { // CHECK-LABEL: test_mm256_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.256 return _mm256_permutex2var_pd(__A,__I,__B); } +TEST_CONSTEXPR(match_m256d( + _mm256_permutex2var_pd((__m256d){1.0, 2.0, 3.0, 4.0}, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}), + 1.0, 10.0, 2.0, 20.0)); __m256d test_mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, __m256d __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.256 // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_permutex2var_pd(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m256d( + _mm256_mask_permutex2var_pd((__m256d){-1.0, -2.0, -3.0, -4.0}, 0x2, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}), + -1.0, 10.0, -3.0, -4.0)); __m256d test_mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_pd // CHECK: @llvm.x86.avx512.vpermi2var.pd.256 // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_permutex2var_pd(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m256d( + _mm256_maskz_permutex2var_pd(0x2, (__m256d){1.0, 2.0, 3.0, 4.0}, (__m256i)(__v4di){0, 4, 1, 5}, (__m256d){10.0, 20.0, 30.0, 40.0}), + 0.0, 10.0, 0.0, 0.0)); __m128 test_mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) { // CHECK-LABEL: test_mm_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.128 return _mm_permutex2var_ps(__A,__I,__B); } +TEST_CONSTEXPR(match_m128( + _mm_permutex2var_ps((__m128){1.f, 2.f, 3.f, 4.f}, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}), + 1.f, 4.f, 10.f, 30.f)); __m128 test_mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) { // CHECK-LABEL: test_mm_mask_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.128 // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_permutex2var_ps(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m128( + _mm_mask_permutex2var_ps((__m128){-1.f, -2.f, -3.f, -4.f}, 0x0A, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}), + -1.f, -4.f, -3.f, 30.f)); __m128 test_mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.128 // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_permutex2var_ps(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m128( + _mm_maskz_permutex2var_ps(0x0A, (__m128){1.f, 2.f, 3.f, 4.f}, (__m128i)(__v4si){0, 3, 4, 6}, (__m128){10.f, 20.f, 30.f, 40.f}), + 0.f, 4.f, 0.f, 30.f)); __m256 test_mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) { // CHECK-LABEL: test_mm256_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.256 return _mm256_permutex2var_ps(__A,__I,__B); } +TEST_CONSTEXPR(match_m256( + _mm256_permutex2var_ps((__m256){0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 
16.f, 17.f}), + 0.f, 7.f, 10.f, 17.f, 1.f, 11.f, 2.f, 12.f)); __m256 test_mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.256 // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_permutex2var_ps(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_m256( + _mm256_mask_permutex2var_ps((__m256){-1.f, -2.f, -3.f, -4.f, -5.f, -6.f, -7.f, -8.f}, 0xAA, (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}), + -1.f, -8.f, -3.f, 17.f, -5.f, 11.f, -7.f, 12.f)); __m256 test_mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, __m256 __B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_ps // CHECK: @llvm.x86.avx512.vpermi2var.ps.256 // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_permutex2var_ps(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_m256( + _mm256_maskz_permutex2var_ps(0xAA, (__m256){0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f}, (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, (__m256){10.f, 11.f, 12.f, 13.f, 14.f, 15.f, 16.f, 17.f}), + 0.f, 7.f, 0.f, 17.f, 0.f, 11.f, 0.f, 12.f)); __m128i test_mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.128 return _mm_permutex2var_epi64(__A,__I,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_permutex2var_epi64((__m128i)(__v2di){10, 20}, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}), + 10, 200)); __m128i test_mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_mask_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.128 // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_permutex2var_epi64(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_mask_permutex2var_epi64((__m128i)(__v2di){-1, -2}, 0x2, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}), + -1, 200)); __m128i test_mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { // CHECK-LABEL: test_mm_maskz_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.128 // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_permutex2var_epi64(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v2di( + _mm_maskz_permutex2var_epi64(0x2, (__m128i)(__v2di){10, 20}, (__m128i)(__v2di){0, 3}, (__m128i)(__v2di){100, 200}), + 0, 200)); __m256i test_mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 return _mm256_permutex2var_epi64(__A,__I,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_permutex2var_epi64((__m256i)(__v4di){0, 10, 20, 30}, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}), + 0, 10, 100, 110)); __m256i test_mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I, __m256i __B) { // CHECK-LABEL: test_mm256_mask_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_permutex2var_epi64(__A,__U,__I,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_mask_permutex2var_epi64((__m256i)(__v4di){-1, -2, -3, -4}, 0x5, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}), + -1, -2, 100, -4)); __m256i test_mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I, __m256i 
__B) { // CHECK-LABEL: test_mm256_maskz_permutex2var_epi64 // CHECK: @llvm.x86.avx512.vpermi2var.q.256 // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_permutex2var_epi64(__U,__A,__I,__B); } +TEST_CONSTEXPR(match_v4di( + _mm256_maskz_permutex2var_epi64(0x5, (__m256i)(__v4di){0, 10, 20, 30}, (__m256i)(__v4di){0, 1, 4, 5}, (__m256i)(__v4di){100, 110, 120, 130}), + 0, 0, 100, 0)); +TEST_CONSTEXPR(match_v4si( + _mm_permutex2var_epi32((__m128i)(__v4si){10, 20, 30, 40}, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + 10, 40, 100, 300)); +TEST_CONSTEXPR(match_v4si( + _mm_mask_permutex2var_epi32((__m128i)(__v4si){-1, -2, -3, -4}, 0x0A, + (__m128i)(__v4si){0, 3, 4, 6}, + (__m128i)(__v4si){100, 200, 300, 400}), + -1, -4, -3, 300)); __m128i test_mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_cvtepi8_epi32 // CHECK: sext <4 x i8> %{{.*}} to <4 x i32> @@ -10472,6 +10589,17 @@ __m256i test_mm256_maskz_shuffle_epi32(__mmask8 __U, __m256i __A) { TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0x33u, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 2,0,0,0, 6,4,0,0)); TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0xAAu, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 0,0,0,0, 0,4,0,4)); TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_epi32(0xFFu, ((__m256i)(__v8si){0,1,2,3,4,5,6,7}), 2), 2,0,0,0, 6,4,4,4)); +TEST_CONSTEXPR(match_v8si( + _mm256_permutex2var_epi32((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8}, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){101, 102, 103, 104, 105, 106, 107, 108}), + 1, 8, 101, 108, 2, 102, 3, 103)); +TEST_CONSTEXPR(match_v8si( + _mm256_mask_permutex2var_epi32((__m256i)(__v8si){-1, -2, -3, -4, -5, -6, -7, -8}, + 0xAA, + (__m256i)(__v8si){0, 7, 8, 15, 1, 9, 2, 10}, + (__m256i)(__v8si){101, 102, 103, 104, 105, 106, 107, 108}), + -1, -8, -3, 108, -5, 102, -7, 103)); __m128d test_mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) { // CHECK-LABEL: test_mm_mask_mov_pd diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c index febef46458ae9..172a3cb219c8a 100644 --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c @@ -1887,6 +1887,67 @@ __m256i test_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_permutex2var_epi16(__U,__A,__I,__B); } + +TEST_CONSTEXPR(match_v8hi( + _mm_permutex2var_epi16((__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70}, + (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10}, + (__m128i)(__v8hi){100, 110, 120, 130, 140, 150, 160, + 170}), + 0, 70, 100, 170, 10, 110, 20, 120)); +TEST_CONSTEXPR(match_v8hi( + _mm_mask_permutex2var_epi16((__m128i)(__v8hi){-1, -2, -3, -4, -5, -6, -7, -8}, + 0xAA, + (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10}, + (__m128i)(__v8hi){100, 110, 120, 130, 140, 150, + 160, 170}), + -1, -8, -3, 170, -5, 110, -7, 120)); +TEST_CONSTEXPR(match_v8hi( + _mm_maskz_permutex2var_epi16(0xAA, + (__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70}, + (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10}, + (__m128i)(__v8hi){100, 110, 120, 130, 140, 150, + 160, 170}), + 0, 70, 0, 170, 0, 110, 0, 120)); +TEST_CONSTEXPR(match_v8hi( + _mm_mask2_permutex2var_epi16((__m128i)(__v8hi){0, 10, 20, 30, 40, 50, 60, 70}, + (__m128i)(__v8hi){0, 7, 8, 15, 1, 9, 2, 10}, + 0x55, + (__m128i)(__v8hi){100, 110, 120, 130, 140, 150, + 160, 170}), + 0, 7, 
100, 15, 10, 9, 20, 10)); +TEST_CONSTEXPR(match_v16hi( + _mm256_permutex2var_epi16( + (__m256i)(__v16hi){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150}, + (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + 0, 150, 200, 350, 10, 210, 20, 220, + 30, 230, 40, 240, 50, 250, 60, 260)); +TEST_CONSTEXPR(match_v16hi( + _mm256_mask_permutex2var_epi16( + (__m256i)(__v16hi){-1, -2, -3, -4, -5, -6, -7, -8, + -9, -10, -11, -12, -13, -14, -15, -16}, + 0xAAAA, + (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + -1, -16, -3, 350, -5, 210, -7, 220, + -9, 230, -11, 240, -13, 250, -15, 260)); +TEST_CONSTEXPR(match_v16hi( + _mm256_maskz_permutex2var_epi16( + 0x5555, + (__m256i)(__v16hi){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 130, 140, 150}, + (__m256i)(__v16hi){0, 15, 16, 31, 1, 17, 2, 18, + 3, 19, 4, 20, 5, 21, 6, 22}, + (__m256i)(__v16hi){200, 210, 220, 230, 240, 250, 260, 270, + 280, 290, 300, 310, 320, 330, 340, 350}), + 0, 0, 200, 0, 10, 0, 20, 0, + 30, 0, 40, 0, 50, 0, 60, 0)); + __m128i test_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { // CHECK-LABEL: test_mm_mask_maddubs_epi16 // CHECK: @llvm.x86.ssse3.pmadd.ub.sw @@ -3596,3 +3657,22 @@ void test_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i _ // CHECK: @llvm.x86.avx512.mask.pmovs.wb.mem.256 _mm256_mask_cvtsepi16_storeu_epi8 ( __P, __M, __A); } + + +TEST_CONSTEXPR(match_v16qu( + _mm_permutex2var_epi8((__m128i)(__v16qu){0, 10, 20, 30, 40, 50, 60, 70, + 80, 90, 100, 110, 120, 127, 126, 125}, + (__m128i)(__v16qu){0, 16, 1, 17, 2, 18, 3, 19, + 4, 20, 5, 21, 6, 22, 7, 23}, + (__m128i)(__v16qu){100, 110, 120, 130, 140, 150, 160, 170, + 180, 190, 200, 210, 220, 230, 240, 250}), + 0, 100, 10, 110, 20, 120, 30, 130, + 40, 140, 50, 150, 60, 160, 70, 170)); +TEST_CONSTEXPR(match_v32qu( + _mm256_permutex2var_epi8((__m256i)(__v32qu){0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 127, 126, 125, 124, 123, 122, 121, 120, 119, 118, 117, 116, 115, 114, 113, 112, 111, 110, 109}, + (__m256i)(__v32qu){0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47}, + (__m256i)(__v32qu){200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231}), + 0, 200, 10, 201, 20, 202, 30, 203, + 40, 204, 50, 205, 60, 206, 70, 207, + 80, 208, 90, 209, 100, 210, 110, 211, + 120, 212, 127, 213, 126, 214, 125, 215)); diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c index a56f8ba1ee385..c7cd9ffdd6966 100644 --- a/clang/test/CodeGen/X86/math-builtins.c +++ b/clang/test/CodeGen/X86/math-builtins.c @@ -118,36 +118,36 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_copysign(f,f); __builtin_copysignf(f,f); __builtin_copysignl(f,f); __builtin_copysignf128(f,f); -// NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) 
[[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]] -// HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] +// NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] +// HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.copysign.f128(fp128, fp128) [[READNONE_INTRINSIC2]] __builtin_fabs(f); __builtin_fabsf(f); __builtin_fabsl(f); __builtin_fabsf128(f); -// NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.fabs.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_frexp(f,i); __builtin_frexpf(f,i); __builtin_frexpl(f,i); __builtin_frexpf128(f,i); -// NO__ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare { fp128, i32 } 
@llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare { double, i32 } @llvm.frexp.f64.i32(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare { float, i32 } @llvm.frexp.f32.i32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare { x86_fp80, i32 } @llvm.frexp.f80.i32(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare { fp128, i32 } @llvm.frexp.f128.i32(fp128) [[READNONE_INTRINSIC2]] __builtin_huge_val(); __builtin_huge_valf(); __builtin_huge_vall(); __builtin_huge_valf128(); @@ -165,10 +165,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_ldexp(f,f); __builtin_ldexpf(f,f); __builtin_ldexpl(f,f); __builtin_ldexpf128(f,f); -// NO__ERRNO: declare double @llvm.ldexp.f64.i32(double, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.ldexp.f32.i32(float, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.ldexp.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.ldexp.f128.i32(fp128, i32) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.ldexp.f64.i32(double, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.ldexp.f32.i32(float, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.ldexp.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.ldexp.f128.i32(fp128, i32) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @ldexp(double noundef, i32 noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @ldexpf(float noundef, i32 noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @ldexpl(x86_fp80 noundef, i32 noundef) [[NOT_READNONE]] @@ -180,7 +180,7 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { // NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] // NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] // NO__ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE:#[0-9]+]] -// HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] +// HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC:#[0-9]+]] // HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE]] @@ -209,10 +209,10 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_pow(f,f); __builtin_powf(f,f); __builtin_powl(f,f); __builtin_powf128(f,f); -// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.pow.f128(fp128, fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.pow.f128(fp128, fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @pow(double noundef, double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @powf(float noundef, float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @powl(x86_fp80 noundef, x86_fp80 
noundef) [[NOT_READNONE]] @@ -220,12 +220,12 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_powi(f,f); __builtin_powif(f,f); __builtin_powil(f,f); -// NO__ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.powi.f64.i32(double, i32) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.powi.f32.i32(float, i32) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.powi.f80.i32(x86_fp80, i32) [[READNONE_INTRINSIC2]] /* math */ __builtin_acos(f); __builtin_acosf(f); __builtin_acosl(f); __builtin_acosf128(f); @@ -307,21 +307,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_ceil(f); __builtin_ceilf(f); __builtin_ceill(f); __builtin_ceilf128(f); -// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.ceil.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_cos(f); __builtin_cosf(f); __builtin_cosl(f); __builtin_cosf128(f); -// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.cos.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.cos.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @cos(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @cosf(float noundef) [[NOT_READNONE]] // 
HAS_ERRNO: declare x86_fp80 @cosl(x86_fp80 noundef) [[NOT_READNONE]] @@ -362,10 +362,10 @@ __builtin_erfc(f); __builtin_erfcf(f); __builtin_erfcl(f); __builtin_ __builtin_exp(f); __builtin_expf(f); __builtin_expl(f); __builtin_expf128(f); -// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.exp.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.exp.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @expf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @expl(x86_fp80 noundef) [[NOT_READNONE]] @@ -373,10 +373,10 @@ __builtin_exp(f); __builtin_expf(f); __builtin_expl(f); __builtin_e __builtin_exp2(f); __builtin_exp2f(f); __builtin_exp2l(f); __builtin_exp2f128(f); -// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.exp2.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.exp2.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp2(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @exp2f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @exp2l(x86_fp80 noundef) [[NOT_READNONE]] @@ -384,10 +384,10 @@ __builtin_exp2(f); __builtin_exp2f(f); __builtin_exp2l(f); __builtin_ __builtin_exp10(f); __builtin_exp10f(f); __builtin_exp10l(f); __builtin_exp10f128(f); -// NO__ERRNO: declare double @llvm.exp10.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp10.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp10.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.exp10.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp10.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp10.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp10.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.exp10.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp10(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @exp10f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @exp10l(x86_fp80 noundef) [[NOT_READNONE]] @@ -417,22 +417,22 @@ __builtin_fdim(f,f); __builtin_fdimf(f,f); __builtin_fdiml(f,f); __bu __builtin_floor(f); __builtin_floorf(f); __builtin_floorl(f); __builtin_floorf128(f); -// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC]] -// 
HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.floor.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_fma(f,f,f); __builtin_fmaf(f,f,f); __builtin_fmal(f,f,f); __builtin_fmaf128(f,f,f); __builtin_fmaf16(f,f,f); -// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.fma.f128(fp128, fp128, fp128) [[READNONE_INTRINSIC]] -// NO__ERRONO: declare half @llvm.fma.f16(half, half, half) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.fma.f128(fp128, fp128, fp128) [[READNONE_INTRINSIC2]] +// NO__ERRONO: declare half @llvm.fma.f16(half, half, half) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @fma(double noundef, double noundef, double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @fmaf(float noundef, float noundef, float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]] @@ -454,25 +454,25 @@ __builtin_fma(f,f,f); __builtin_fmaf(f,f,f); __builtin_fmal(f,f,f); __builtin_fmax(f,f); __builtin_fmaxf(f,f); __builtin_fmaxl(f,f); __builtin_fmaxf128(f,f); -// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) 
[[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.maxnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]] __builtin_fmin(f,f); __builtin_fminf(f,f); __builtin_fminl(f,f); __builtin_fminf128(f,f); -// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.minnum.f128(fp128, fp128) [[READNONE_INTRINSIC2]] __builtin_hypot(f,f); __builtin_hypotf(f,f); __builtin_hypotl(f,f); __builtin_hypotf128(f,f); @@ -509,10 +509,10 @@ __builtin_lgamma(f); __builtin_lgammaf(f); __builtin_lgammal(f); __builti __builtin_llrint(f); __builtin_llrintf(f); __builtin_llrintl(f); __builtin_llrintf128(f); -// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @llrint(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llrintf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llrintl(x86_fp80 noundef) [[NOT_READNONE]] @@ -520,10 +520,10 @@ __builtin_llrint(f); __builtin_llrintf(f); __builtin_llrintl(f); __builti __builtin_llround(f); __builtin_llroundf(f); __builtin_llroundl(f); __builtin_llroundf128(f); -// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) 
[[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @llround(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llroundf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llroundl(x86_fp80 noundef) [[NOT_READNONE]] @@ -531,10 +531,10 @@ __builtin_llround(f); __builtin_llroundf(f); __builtin_llroundl(f); __built __builtin_log(f); __builtin_logf(f); __builtin_logl(f); __builtin_logf128(f); -// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.log.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.log.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @logf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @logl(x86_fp80 noundef) [[NOT_READNONE]] @@ -542,10 +542,10 @@ __builtin_log(f); __builtin_logf(f); __builtin_logl(f); __builtin_l __builtin_log10(f); __builtin_log10f(f); __builtin_log10l(f); __builtin_log10f128(f); -// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.log10.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.log10.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log10(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @log10f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @log10l(x86_fp80 noundef) [[NOT_READNONE]] @@ -564,10 +564,10 @@ __builtin_log1p(f); __builtin_log1pf(f); __builtin_log1pl(f); __builtin __builtin_log2(f); __builtin_log2f(f); __builtin_log2l(f); __builtin_log2f128(f); -// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.log2.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.log2.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log2(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @log2f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @log2l(x86_fp80 noundef) [[NOT_READNONE]] @@ -586,10 +586,10 @@ __builtin_logb(f); __builtin_logbf(f); __builtin_logbl(f); __builtin_ __builtin_lrint(f); __builtin_lrintf(f); __builtin_lrintl(f); 
__builtin_lrintf128(f); -// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @lrint(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lrintf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lrintl(x86_fp80 noundef) [[NOT_READNONE]] @@ -597,10 +597,10 @@ __builtin_lrint(f); __builtin_lrintf(f); __builtin_lrintl(f); __builtin __builtin_lround(f); __builtin_lroundf(f); __builtin_lroundl(f); __builtin_lroundf128(f); -// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @lround(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lroundf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lroundl(x86_fp80 noundef) [[NOT_READNONE]] @@ -608,14 +608,14 @@ __builtin_lround(f); __builtin_lroundf(f); __builtin_lroundl(f); __built __builtin_nearbyint(f); __builtin_nearbyintf(f); __builtin_nearbyintl(f); __builtin_nearbyintf128(f); -// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.nearbyint.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_nextafter(f,f); __builtin_nextafterf(f,f); __builtin_nextafterl(f,f); __builtin_nextafterf128(f,f); @@ -663,25 +663,25 @@ __builtin_remquo(f,f,i); 
__builtin_remquof(f,f,i); __builtin_remquol(f,f,i); __ __builtin_rint(f); __builtin_rintf(f); __builtin_rintl(f); __builtin_rintf128(f); -// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.rint.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_round(f); __builtin_roundf(f); __builtin_roundl(f); __builtin_roundf128(f); -// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.round.f128(fp128) [[READNONE_INTRINSIC2]] __builtin_scalbln(f,f); __builtin_scalblnf(f,f); __builtin_scalblnl(f,f); __builtin_scalblnf128(f,f); @@ -707,10 +707,10 @@ __builtin_scalbn(f,f); __builtin_scalbnf(f,f); __builtin_scalbnl(f,f); __ __builtin_sin(f); __builtin_sinf(f); __builtin_sinl(f); __builtin_sinf128(f); -// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.sin.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.sin.f128(fp128) 
[[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @sin(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @sinf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @sinl(x86_fp80 noundef) [[NOT_READNONE]] @@ -747,10 +747,10 @@ __builtin_sincospi(f,d,d); __builtin_sincospif(f,fp,fp); __builtin_sincospil(f,l __builtin_sqrt(f); __builtin_sqrtf(f); __builtin_sqrtl(f); __builtin_sqrtf128(f); -// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.sqrt.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.sqrt.f128(fp128) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @sqrt(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @sqrtf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @sqrtl(x86_fp80 noundef) [[NOT_READNONE]] @@ -791,22 +791,24 @@ __builtin_tgamma(f); __builtin_tgammaf(f); __builtin_tgammal(f); __builti __builtin_trunc(f); __builtin_truncf(f); __builtin_truncl(f); __builtin_truncf128(f); -// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare fp128 @llvm.trunc.f128(fp128) [[READNONE_INTRINSIC2]] }; // NO__ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} } +// NO__ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} } // NO__ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} } // NO__ERRNO: attributes [[PURE]] = { {{.*}}memory(read){{.*}} } // NO__ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} } +// HAS_ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[PURE]] = { {{.*}}memory(read){{.*}} } // HAS_ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} } diff --git a/clang/test/CodeGen/builtin-sqrt.c b/clang/test/CodeGen/builtin-sqrt.c index 2313a68d2d0e2..3ebf2ac91ccdf 100644 --- a/clang/test/CodeGen/builtin-sqrt.c +++ 
b/clang/test/CodeGen/builtin-sqrt.c @@ -11,5 +11,5 @@ float foo(float X) { // HAS_ERRNO-NOT: attributes [[ATTR]] = {{{.*}} memory(none) // NO_ERRNO: declare float @llvm.sqrt.f32(float) [[ATTR:#[0-9]+]] -// NO_ERRNO: attributes [[ATTR]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +// NO_ERRNO: attributes [[ATTR]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/clang/test/CodeGen/libcalls.c b/clang/test/CodeGen/libcalls.c index 1e4b06e34aaf9..923719f3ec8e4 100644 --- a/clang/test/CodeGen/libcalls.c +++ b/clang/test/CodeGen/libcalls.c @@ -74,9 +74,9 @@ void test_fma(float a0, double a1, long double a2) { // CHECK-YES: declare float @fmaf(float noundef, float noundef, float noundef) // CHECK-YES: declare double @fma(double noundef, double noundef, double noundef) // CHECK-YES: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef) -// CHECK-NO: declare float @llvm.fma.f32(float, float, float) [[NUW_RN2:#[0-9]+]] -// CHECK-NO: declare double @llvm.fma.f64(double, double, double) [[NUW_RN2]] -// CHECK-NO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[NUW_RN2]] +// CHECK-NO: declare float @llvm.fma.f32(float, float, float) [[NUW_RNI]] +// CHECK-NO: declare double @llvm.fma.f64(double, double, double) [[NUW_RNI]] +// CHECK-NO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[NUW_RNI]] // Just checking to make sure these library functions are marked readnone void test_builtins(double d, float f, long double ld) { @@ -85,9 +85,9 @@ void test_builtins(double d, float f, long double ld) { double atan_ = atan(d); long double atanl_ = atanl(ld); float atanf_ = atanf(f); -// CHECK-NO: declare double @llvm.atan.f64(double) [[NUW_RNI:#[0-9]+]] -// CHECK-NO: declare x86_fp80 @llvm.atan.f80(x86_fp80) [[NUW_RNI]] -// CHECK-NO: declare float @llvm.atan.f32(float) [[NUW_RNI]] +// CHECK-NO: declare double @llvm.atan.f64(double) [[NUW_RN2:#[0-9]+]] +// CHECK-NO: declare x86_fp80 @llvm.atan.f80(x86_fp80) [[NUW_RN2]] +// CHECK-NO: declare float @llvm.atan.f32(float) [[NUW_RN2]] // CHECK-YES: declare double @atan(double noundef) [[NUW:#[0-9]+]] // CHECK-YES: declare x86_fp80 @atanl(x86_fp80 noundef) [[NUW]] // CHECK-YES: declare float @atanf(float noundef) [[NUW]] @@ -95,9 +95,9 @@ void test_builtins(double d, float f, long double ld) { double atan2_ = atan2(d, 2); long double atan2l_ = atan2l(ld, ld); float atan2f_ = atan2f(f, f); -// CHECK-NO: declare double @llvm.atan2.f64(double, double) [[NUW_RNI]] -// CHECK-NO: declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80) [[NUW_RNI]] -// CHECK-NO: declare float @llvm.atan2.f32(float, float) [[NUW_RNI]] +// CHECK-NO: declare double @llvm.atan2.f64(double, double) [[NUW_RN2]] +// CHECK-NO: declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80) [[NUW_RN2]] +// CHECK-NO: declare float @llvm.atan2.f32(float, float) [[NUW_RN2]] // CHECK-YES: declare double @atan2(double noundef, double noundef) [[NUW]] // CHECK-YES: declare x86_fp80 @atan2l(x86_fp80 noundef, x86_fp80 noundef) [[NUW]] // CHECK-YES: declare float @atan2f(float noundef, float noundef) [[NUW]] @@ -124,4 +124,4 @@ void test_builtins(double d, float f, long double ld) { } // CHECK-YES: attributes [[NUW]] = { nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+cx8,+x87" } -// CHECK-NO-DAG: attributes [[NUW_RNI]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +// CHECK-NO-DAG: attributes [[NUW_RNI]] = { 
nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c index ad297828f48ed..d4cd6f86b3c51 100644 --- a/clang/test/CodeGen/math-libcalls.c +++ b/clang/test/CodeGen/math-libcalls.c @@ -35,27 +35,27 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { copysign(f,f); copysignf(f,f);copysignl(f,f); - // NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC]] - // NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] - // NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]] - // HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC:#[0-9]+]] - // HAS_MAYTRAP: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] + // NO__ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] + // NO__ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] + // NO__ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] + // HAS_ERRNO: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare double @llvm.copysign.f64(double, double) [[READNONE_INTRINSIC2:#[0-9]+]] + // HAS_MAYTRAP: declare float @llvm.copysign.f32(float, float) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] fabs(f); fabsf(f); fabsl(f); - // NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] - // NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] - // NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC]] - // HAS_MAYTRAP: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC]] + // NO__ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] + // NO__ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] + // NO__ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] + // HAS_ERRNO: declare x86_fp80 @llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare double @llvm.fabs.f64(double) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare float @llvm.fabs.f32(float) [[READNONE_INTRINSIC2]] + // HAS_MAYTRAP: declare x86_fp80 
@llvm.fabs.f80(x86_fp80) [[READNONE_INTRINSIC2]] frexp(f,i); frexpf(f,i); frexpl(f,i); @@ -86,7 +86,7 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { // NO__ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] // NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] // NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] - // HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] + // HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC:#[0-9]+]] // HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] // HAS_MAYTRAP: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]] @@ -107,9 +107,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { pow(f,f); powf(f,f); powl(f,f); -// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.pow.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.pow.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @pow(double noundef, double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @powf(float noundef, float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @powl(x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]] @@ -206,21 +206,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { ceil(f); ceilf(f); ceill(f); -// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.ceil.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.ceil.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.ceil.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.ceil.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.ceil.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.ceil.f80( cos(f); cosf(f); cosl(f); -// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.cos.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.cos.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.cos.f80(x86_fp80) [[READNONE_INTRINSIC2]] // 
HAS_ERRNO: declare double @cos(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @cosf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @cosl(x86_fp80 noundef) [[NOT_READNONE]] @@ -266,9 +266,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { exp(f); expf(f); expl(f); -// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @expf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @expl(x86_fp80 noundef) [[NOT_READNONE]] @@ -278,9 +278,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { exp2(f); exp2f(f); exp2l(f); -// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.exp2.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.exp2.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.exp2.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @exp2(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @exp2f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @exp2l(x86_fp80 noundef) [[NOT_READNONE]] @@ -314,21 +314,21 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { floor(f); floorf(f); floorl(f); -// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.floor.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.floor.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.floor.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.floor.f64 // HAS_MAYTRAP: declare float @llvm.experimental.constrained.floor.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.floor.f80( fma(f,f,f); fmaf(f,f,f); fmal(f,f,f); -// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.fma.f64(double, double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.fma.f32(float, float, float) 
[[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @fma(double noundef, double noundef, double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @fmaf(float noundef, float noundef, float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @fmal(x86_fp80 noundef, x86_fp80 noundef, x86_fp80 noundef) [[NOT_READNONE]] @@ -350,39 +350,39 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { fmax(f,f); fmaxf(f,f); fmaxl(f,f); -// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.maxnum.f64(double, double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.maxnum.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.maxnum.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.maxnum.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.maxnum.f80( fmin(f,f); fminf(f,f); fminl(f,f); -// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.minnum.f64(double, double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.minnum.f32(float, float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.minnum.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.minnum.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.minnum.f80( fmaximum_num(*d,*d); fmaximum_numf(f,f); fmaximum_numl(*l,*l); -// COMMON: declare double @llvm.maximumnum.f64(double, double) [[READNONE_INTRINSIC]] -// COMMON: declare float @llvm.maximumnum.f32(float, float) [[READNONE_INTRINSIC]] -// COMMON: declare x86_fp80 @llvm.maximumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// COMMON: declare double @llvm.maximumnum.f64(double, double) 
[[READNONE_INTRINSIC2]] +// COMMON: declare float @llvm.maximumnum.f32(float, float) [[READNONE_INTRINSIC2]] +// COMMON: declare x86_fp80 @llvm.maximumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] fminimum_num(*d,*d); fminimum_numf(f,f); fminimum_numl(*l,*l); -// COMMON: declare double @llvm.minimumnum.f64(double, double) [[READNONE_INTRINSIC]] -// COMMON: declare float @llvm.minimumnum.f32(float, float) [[READNONE_INTRINSIC]] -// COMMON: declare x86_fp80 @llvm.minimumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC]] +// COMMON: declare double @llvm.minimumnum.f64(double, double) [[READNONE_INTRINSIC2]] +// COMMON: declare float @llvm.minimumnum.f32(float, float) [[READNONE_INTRINSIC2]] +// COMMON: declare x86_fp80 @llvm.minimumnum.f80(x86_fp80, x86_fp80) [[READNONE_INTRINSIC2]] hypot(f,f); hypotf(f,f); hypotl(f,f); @@ -422,9 +422,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { llrint(f); llrintf(f); llrintl(f); -// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @llrint(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llrintf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llrintl(x86_fp80 noundef) [[NOT_READNONE]] @@ -434,9 +434,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { llround(f); llroundf(f); llroundl(f); -// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.llround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @llround(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llroundf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @llroundl(x86_fp80 noundef) [[NOT_READNONE]] @@ -446,9 +446,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { log(f); logf(f); logl(f); -// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @logf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @logl(x86_fp80 noundef) [[NOT_READNONE]] @@ -458,9 +458,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { log10(f); log10f(f); log10l(f); -// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log10.f32(float) 
[[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log10.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log10.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log10.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log10(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @log10f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @log10l(x86_fp80 noundef) [[NOT_READNONE]] @@ -482,9 +482,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { log2(f); log2f(f); log2l(f); -// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.log2.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.log2.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.log2.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @log2(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @log2f(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @log2l(x86_fp80 noundef) [[NOT_READNONE]] @@ -506,9 +506,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { lrint(f); lrintf(f); lrintl(f); -// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lrint.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @lrint(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lrintf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lrintl(x86_fp80 noundef) [[NOT_READNONE]] @@ -518,9 +518,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { lround(f); lroundf(f); lroundl(f); -// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare i64 @llvm.lround.i64.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare i64 @lround(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lroundf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare i64 @lroundl(x86_fp80 noundef) [[NOT_READNONE]] @@ -530,12 +530,12 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { nearbyint(f); nearbyintf(f); nearbyintl(f); -// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare 
x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.nearbyint.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.nearbyint.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.nearbyint.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.nearbyint.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.nearbyint.f80( @@ -590,24 +590,24 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { rint(f); rintf(f); rintl(f); -// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.rint.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.rint.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.rint.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.rint.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.rint.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.rint.f80( round(f); roundf(f); roundl(f); -// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.round.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.round.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.round.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_MAYTRAP: declare double @llvm.experimental.constrained.round.f64( // HAS_MAYTRAP: declare float @llvm.experimental.constrained.round.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.round.f80( @@ -638,9 +638,9 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { sin(f); sinf(f); sinl(f); -// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare 
x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.sin.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.sin.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.sin.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @sin(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @sinf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @sinl(x86_fp80 noundef) [[NOT_READNONE]] @@ -674,9 +674,9 @@ sincos(f, d, d); sincosf(f, fp, fp); sincosl(f, l, l); sqrt(f); sqrtf(f); sqrtl(f); -// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.sqrt.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.sqrt.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.sqrt.f80(x86_fp80) [[READNONE_INTRINSIC2]] // HAS_ERRNO: declare double @sqrt(double noundef) [[NOT_READNONE]] // HAS_ERRNO: declare float @sqrtf(float noundef) [[NOT_READNONE]] // HAS_ERRNO: declare x86_fp80 @sqrtl(x86_fp80 noundef) [[NOT_READNONE]] @@ -722,20 +722,22 @@ sincos(f, d, d); sincosf(f, fp, fp); sincosl(f, l, l); trunc(f); truncf(f); truncl(f); -// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]] -// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC]] -// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]] +// NO__ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare double @llvm.trunc.f64(double) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare float @llvm.trunc.f32(float) [[READNONE_INTRINSIC2]] +// HAS_ERRNO: declare x86_fp80 @llvm.trunc.f80(x86_fp80) [[READNONE_INTRINSIC2]] }; // NO__ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} } +// NO__ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} } // NO__ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} } // NO__ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} } // NO__ERRNO: attributes [[READONLY]] = { {{.*}}memory(read){{.*}} } // HAS_ERRNO: attributes [[NOT_READNONE]] = { nounwind {{.*}} } +// HAS_ERRNO: attributes [[READNONE_INTRINSIC2]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[READNONE_INTRINSIC]] = { {{.*}}memory(none){{.*}} } // HAS_ERRNO: attributes [[READONLY]] = { {{.*}}memory(read){{.*}} } // HAS_ERRNO: attributes [[READNONE]] = { {{.*}}memory(none){{.*}} } diff --git a/clang/test/CodeGen/ms-empty-enum.c b/clang/test/CodeGen/ms-empty-enum.c new file mode 100644 index 0000000000000..6c1c87b756f9a --- /dev/null +++ b/clang/test/CodeGen/ms-empty-enum.c @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -fms-extensions -triple x86_64-windows-msvc -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fms-extensions -triple i386-windows-msvc -Wno-implicit-function-declaration -emit-llvm %s -o - | FileCheck %s + +typedef enum tag1 {} A; + +// 
CHECK: void @empty_enum(i32 noundef %a) +void empty_enum(A a) {} diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl new file mode 100644 index 0000000000000..a7c01015b2015 --- /dev/null +++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl @@ -0,0 +1,95 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -disable-llvm-passes -emit-llvm -finclude-default-header -o - %s | FileCheck %s + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case1v( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <6 x float> +// +float3x2 case1() { + // vec[0] = 0 + // vec[1] = 2 + // vec[2] = 4 + // vec[3] = 1 + // vec[4] = 3 + // vec[5] = 5 + return float3x2(0, 1, + 2, 3, + 4, 5); +} + + +RWStructuredBuffer In; + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case2v( +// CHECK-SAME: ) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 0) #[[ATTR3:[0-9]+]] +// CHECK-NEXT: [[CALL1:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 1) #[[ATTR3]] +// CHECK-NEXT: [[CALL2:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 2) #[[ATTR3]] +// CHECK-NEXT: [[CALL3:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 3) #[[ATTR3]] +// CHECK-NEXT: [[CALL4:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 4) #[[ATTR3]] +// CHECK-NEXT: [[CALL5:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN4hlsl18RWStructuredBufferIfEixEj(ptr noundef nonnull align 4 dereferenceable(8) @_ZL2In, i32 noundef 5) #[[ATTR3]] +// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[CALL]], align 4 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <6 x float> poison, float [[TMP0]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[CALL2]], align 4 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT]], float [[TMP1]], i32 1 +// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[CALL4]], align 4 +// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[TMP2]], i32 2 +// CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[CALL1]], align 4 +// CHECK-NEXT: [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT7]], float [[TMP3]], i32 3 +// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[CALL3]], align 4 +// CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <6 x float> [[VECINIT8]], float [[TMP4]], i32 4 +// CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[CALL5]], align 4 +// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <6 x float> [[VECINIT9]], float [[TMP5]], i32 5 +// CHECK-NEXT: ret <6 x float> [[VECINIT10]] +// +float3x2 case2() { + // vec[0] = Call + // vec[1] = Call2 + // vec[2] = Call4 + // vec[3] = Call1 + // vec[4] = Call3 + // vec[5] = Call5 + return float3x2(In[0], In[1], + In[2], In[3], + In[4], 
In[5]); +} + + +// CHECK-LABEL: define hidden noundef nofpclass(nan inf) <6 x float> @_Z5case3Dv3_fS_( +// CHECK-SAME: <3 x float> noundef nofpclass(nan inf) [[A:%.*]], <3 x float> noundef nofpclass(nan inf) [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <3 x float>, align 16 +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <3 x float>, align 16 +// CHECK-NEXT: store <3 x float> [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store <3 x float> [[B]], ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT:%.*]] = extractelement <3 x float> [[TMP0]], i64 0 +// CHECK-NEXT: [[VECINIT:%.*]] = insertelement <6 x float> poison, float [[VECEXT]], i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <3 x float> [[TMP1]], i64 2 +// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <6 x float> [[VECINIT]], float [[VECEXT1]], i32 1 +// CHECK-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <3 x float> [[TMP2]], i64 1 +// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <6 x float> [[VECINIT2]], float [[VECEXT3]], i32 2 +// CHECK-NEXT: [[TMP3:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT5:%.*]] = extractelement <3 x float> [[TMP3]], i64 1 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT4]], float [[VECEXT5]], i32 3 +// CHECK-NEXT: [[TMP4:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT7:%.*]] = extractelement <3 x float> [[TMP4]], i64 0 +// CHECK-NEXT: [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[VECEXT7]], i32 4 +// CHECK-NEXT: [[TMP5:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16 +// CHECK-NEXT: [[VECEXT9:%.*]] = extractelement <3 x float> [[TMP5]], i64 2 +// CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <6 x float> [[VECINIT8]], float [[VECEXT9]], i32 5 +// CHECK-NEXT: ret <6 x float> [[VECINIT10]] +// +float3x2 case3(float3 a, float3 b) { + // vec[0] = A[0] + // vec[1] = A[2] + // vec[2] = B[1] + // vec[3] = A[1] + // vec[4] = B[0] + // vec[5] = B[2] + return float3x2(a,b); +} diff --git a/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl b/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl new file mode 100644 index 0000000000000..65dba664bb5ea --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/f16tof32-builtin.hlsl @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s + +// CHECK: define hidden noundef nofpclass(nan inf) float +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %0) +// CHECK: ret float %hlsl.f16tof32 +// CHECK: declare float @llvm.dx.legacyf16tof32.i32(i32) +float test_scalar(uint p0) { return __builtin_hlsl_elementwise_f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <2 x float> +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %0) +// CHECK: ret <2 x float> %hlsl.f16tof32 +// CHECK: declare <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32>) +float2 test_uint2(uint2 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) #0 { +// CHECK: 
%hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %0) +// CHECK: ret <3 x float> %hlsl.f16tof32 +// CHECK: declare <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32>) +float3 test_uint3(uint3 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) #0 { +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %0) +// CHECK: ret <4 x float> %hlsl.f16tof32 +// CHECK: declare <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32>) +float4 test_uint4(uint4 p0) { return __builtin_hlsl_elementwise_f16tof32(p0); } + + + diff --git a/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl b/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl new file mode 100644 index 0000000000000..b68bc197f16c5 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/f16tof32.hlsl @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -std=hlsl202x -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s + +// CHECK: define hidden noundef nofpclass(nan inf) float +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %0) +// CHECK: ret float %hlsl.f16tof32 +// CHECK: declare float @llvm.dx.legacyf16tof32.i32(i32) +float test_scalar(uint p0) { return f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <2 x float> +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %0) +// CHECK: ret <2 x float> %hlsl.f16tof32 +// CHECK: declare <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32>) +float2 test_uint2(uint2 p0) { return f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) #0 { +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %0) +// CHECK: ret <3 x float> %hlsl.f16tof32 +// CHECK: declare <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32>) +float3 test_uint3(uint3 p0) { return f16tof32(p0); } + +// CHECK: define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) #0 { +// CHECK: %hlsl.f16tof32 = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %0) +// CHECK: ret <4 x float> %hlsl.f16tof32 +// CHECK: declare <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32>) +float4 test_uint4(uint4 p0) { return f16tof32(p0); } + + + diff --git a/clang/test/CodeGenHLSL/builtins/select.hlsl b/clang/test/CodeGenHLSL/builtins/select.hlsl index 7590b4a881259..e5169844cb3f2 100644 --- a/clang/test/CodeGenHLSL/builtins/select.hlsl +++ b/clang/test/CodeGenHLSL/builtins/select.hlsl @@ -20,16 +20,6 @@ struct S test_select_infer_struct(bool cond0, struct S tVal, struct S fVal) { return select(cond0, tVal, fVal); } -// CHECK-LABEL: test_select_infer_array -// CHECK: [[TRUE_VAL:%.*]] = load [3 x i32], ptr {{%.*}}, align 4 -// CHECK: [[FALSE_VAL:%.*]] = load [3 x i32], ptr {{%.*}}, align 4 -// CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, [3 x i32] [[TRUE_VAL]], [3 x i32] [[FALSE_VAL]] -// CHECK: store [3 x i32] [[SELECT]], ptr {{%.*}}, align 4 -// CHECK: ret void -int test_select_infer_array(bool cond, int tVal[3], int fVal[3])[3] { - return select(cond, tVal, fVal); -} - // CHECK-LABEL: test_select_bool_vector // CHECK: [[SELECT:%.*]] = 
select i1 {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> {{%.*}} // CHECK: ret <2 x i32> [[SELECT]] @@ -38,24 +28,24 @@ int2 test_select_bool_vector(bool cond0, int2 tVal, int2 fVal) { } // CHECK-LABEL: test_select_vector_1 -// CHECK: [[SELECT:%.*]] = select <1 x i1> {{%.*}}, <1 x i32> {{%.*}}, <1 x i32> {{%.*}} +// CHECK: [[SELECT:%.*]] = select i1 {{%.*}}, <1 x i32> {{%.*}}, <1 x i32> {{%.*}} // CHECK: ret <1 x i32> [[SELECT]] int1 test_select_vector_1(bool1 cond0, int1 tVals, int1 fVals) { - return select(cond0, tVals, fVals); + return select(cond0, tVals, fVals); } // CHECK-LABEL: test_select_vector_2 // CHECK: [[SELECT:%.*]] = select <2 x i1> {{%.*}}, <2 x i32> {{%.*}}, <2 x i32> {{%.*}} // CHECK: ret <2 x i32> [[SELECT]] int2 test_select_vector_2(bool2 cond0, int2 tVals, int2 fVals) { - return select(cond0, tVals, fVals); + return select(cond0, tVals, fVals); } // CHECK-LABEL: test_select_vector_3 // CHECK: [[SELECT:%.*]] = select <3 x i1> {{%.*}}, <3 x i32> {{%.*}}, <3 x i32> {{%.*}} // CHECK: ret <3 x i32> [[SELECT]] int3 test_select_vector_3(bool3 cond0, int3 tVals, int3 fVals) { - return select(cond0, tVals, fVals); + return select(cond0, tVals, fVals); } // CHECK-LABEL: test_select_vector_4 @@ -86,10 +76,54 @@ int4 test_select_vector_vector_scalar(bool4 cond0, int4 tVals, int fVal) { // CHECK-LABEL: test_select_vector_scalar_scalar // CHECK: [[SPLAT_SRC1:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0 // CHECK: [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC1]], <4 x i32> poison, <4 x i32> zeroinitializer -// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 %3, i64 0 +// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0 // CHECK: [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC2]], <4 x i32> poison, <4 x i32> zeroinitializer // CHECK: [[SELECT:%.*]] = select <4 x i1> {{%.*}}, <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]] // CHECK: ret <4 x i32> [[SELECT]] int4 test_select_vector_scalar_scalar(bool4 cond0, int tVal, int fVal) { return select(cond0, tVal, fVal); } + +// CHECK-LABEL: test_select_nonbool_cond_vector_4 +// CHECK: [[TMP0:%.*]] = load <4 x i32>, ptr %cond0.addr, align 16 +// CHECK: [[TOBOOL:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer +// CHECK: [[SELECT:%.*]] = select <4 x i1> [[TOBOOL]], <4 x i1> {{%.*}}, <4 x i1> {{%.*}} +// CHECK: ret <4 x i1> [[SELECT]] +bool4 test_select_nonbool_cond_vector_4(int4 cond0, bool4 tVal, bool4 fVal) { + return select(cond0, tVal, fVal); +} + +// CHECK-LABEL: test_select_nonbool_cond_vector_scalar_vector +// CHECK: [[TMP0:%.*]] = load <3 x i32>, ptr %cond0.addr, align 16 +// CHECK: [[TOBOOL:%.*]] = icmp ne <3 x i32> [[TMP0]], zeroinitializer +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <3 x i32> poison, i32 {{%.*}}, i64 0 +// CHECK: [[SPLAT1:%.*]] = shufflevector <3 x i32> [[SPLAT_SRC1]], <3 x i32> poison, <3 x i32> zeroinitializer +// CHECK: [[SELECT:%.*]] = select <3 x i1> [[TOBOOL]], <3 x i32> [[SPLAT1]], <3 x i32> {{%.*}} +// CHECK: ret <3 x i32> [[SELECT]] +int3 test_select_nonbool_cond_vector_scalar_vector(int3 cond0, int tVal, int3 fVal) { + return select(cond0, tVal, fVal); +} + +// CHECK-LABEL: test_select_nonbool_cond_vector_vector_scalar +// CHECK: [[TMP0:%.*]] = load <2 x i32>, ptr %cond0.addr, align 8 +// CHECK: [[TOBOOL:%.*]] = icmp ne <2 x i32> [[TMP0]], zeroinitializer +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <2 x i32> poison, i32 {{%.*}}, i64 0 +// CHECK: [[SPLAT1:%.*]] = shufflevector <2 x i32> [[SPLAT_SRC1]], <2 x i32> poison, <2 x i32> zeroinitializer 
+// CHECK: [[SELECT:%.*]] = select <2 x i1> [[TOBOOL]], <2 x i32> {{%.*}}, <2 x i32> [[SPLAT1]] +// CHECK: ret <2 x i32> [[SELECT]] +int2 test_select_nonbool_cond_vector_vector_scalar(int2 cond0, int2 tVal, int fVal) { + return select(cond0, tVal, fVal); +} + +// CHECK-LABEL: test_select_nonbool_cond_vector_scalar_scalar +// CHECK: [[TMP0:%.*]] = load <4 x i32>, ptr %cond0.addr, align 16 +// CHECK: [[TOBOOL:%.*]] = icmp ne <4 x i32> [[TMP0]], zeroinitializer +// CHECK: [[SPLAT_SRC1:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0 +// CHECK: [[SPLAT1:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC1]], <4 x i32> poison, <4 x i32> zeroinitializer +// CHECK: [[SPLAT_SRC2:%.*]] = insertelement <4 x i32> poison, i32 {{%.*}}, i64 0 +// CHECK: [[SPLAT2:%.*]] = shufflevector <4 x i32> [[SPLAT_SRC2]], <4 x i32> poison, <4 x i32> zeroinitializer +// CHECK: [[SELECT:%.*]] = select <4 x i1> [[TOBOOL]], <4 x i32> [[SPLAT1]], <4 x i32> [[SPLAT2]] +// CHECK: ret <4 x i32> [[SELECT]] +int4 test_select_nonbool_cond_vector_scalar_scalar(int4 cond0, int tVal, int fVal) { + return select(cond0, tVal, fVal); +} diff --git a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl index ea1f734391614..5cbf6452d4c85 100644 --- a/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl +++ b/clang/test/CodeGenOpenCL/cl20-device-side-enqueue-attributes.cl @@ -199,7 +199,7 @@ kernel void device_side_enqueue(global float *a, global float *b, int i) { // SPIR32: attributes #[[ATTR0]] = { convergent noinline norecurse nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } // SPIR32: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } // SPIR32: attributes #[[ATTR2]] = { convergent noinline nounwind optnone "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// SPIR32: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +// SPIR32: attributes #[[ATTR3:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } // SPIR32: attributes #[[ATTR4]] = { convergent nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // SPIR32: attributes #[[ATTR5]] = { convergent nounwind "uniform-work-group-size"="true" } //. diff --git a/clang/test/DebugInfo/KeyInstructions/flag.cpp b/clang/test/DebugInfo/KeyInstructions/flag.cpp index 6aeeed664135e..4a4a5c4c142a7 100644 --- a/clang/test/DebugInfo/KeyInstructions/flag.cpp +++ b/clang/test/DebugInfo/KeyInstructions/flag.cpp @@ -8,6 +8,9 @@ // KEY-INSTRUCTIONS: "-gkey-instructions" // NO-KEY-INSTRUCTIONS-NOT: key-instructions + +// Only expect one dwarf related flag. +// NO-DEBUG: -fdwarf2-cfi-asm // NO-DEBUG-NOT: debug-info-kind // NO-DEBUG-NOT: dwarf diff --git a/clang/test/Driver/hip-spirv-translator-new-driver.c b/clang/test/Driver/hip-spirv-translator-new-driver.c new file mode 100644 index 0000000000000..67d894e2eb506 --- /dev/null +++ b/clang/test/Driver/hip-spirv-translator-new-driver.c @@ -0,0 +1,9 @@ +// The --offload-new-driver was crashing when using -save-temps due to a failure in clang-linker-wrapper. +// The input and output files cannot be the same. 
+ +// RUN: %clang --offload-new-driver -### -save-temps -nogpuinc -nogpulib \ +// RUN: --target=x86_64-unknown-linux-gnu --offload-arch=amdgcnspirv -x hip %s 2>&1 \ +// RUN: | FileCheck %s + +// CHECK-NOT: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" "[[OUTPUT_FILE:.*.o]]" {{.*}}"[[OUTPUT_FILE]]" +// CHECK: {{".*clang-linker-wrapper.*"}} {{.*}} "-o" {{".*.tmp.o"}} diff --git a/clang/test/Driver/mg.c b/clang/test/Driver/mg.c index 82d8a6084e5e0..b7458a08698d3 100644 --- a/clang/test/Driver/mg.c +++ b/clang/test/Driver/mg.c @@ -1,5 +1,7 @@ -// RUN: %clang -M -MG -include nonexistent-preinclude.h %s | FileCheck %s +// RUN: %clang -M -MG -include nonexistent-preinclude.h -std=c23 %s | FileCheck %s // CHECK: nonexistent-preinclude.h // CHECK: nonexistent-ppinclude.h +// CHECK: nonexistent-embed #include "nonexistent-ppinclude.h" +#embed "nonexistent-embed" diff --git a/clang/test/Driver/no-gpu-bundle-respected.hip b/clang/test/Driver/no-gpu-bundle-respected.hip new file mode 100644 index 0000000000000..fc93640dc4b90 --- /dev/null +++ b/clang/test/Driver/no-gpu-bundle-respected.hip @@ -0,0 +1,24 @@ +// RUN: %clang -ccc-print-phases -c -emit-llvm \ +// RUN: --offload-arch=gfx900,gfx1030 -O3 -x hip %s \ +// RUN: 2>&1 | FileCheck %s --check-prefix=BUNDLE + +// RUN: %clang -ccc-print-phases -c -emit-llvm \ +// RUN: --gpu-bundle-output --offload-arch=gfx900,gfx1030 -O3 -x hip %s \ +// RUN: 2>&1 | FileCheck %s --check-prefix=BUNDLE + +// RUN: %clang -ccc-print-phases -c -emit-llvm \ +// RUN: --no-gpu-bundle-output --offload-arch=gfx900,gfx1030 -O3 -x hip %s \ +// RUN: 2>&1 | FileCheck %s --check-prefixes=COMPILER,GFX1030,GFX900,OFFLOAD,NOBUNDLE + +// BUNDLE: clang-offload-bundler +// NOBUNDLE-NOT: clang-offload-bundler + +// COM: sanity checks +// COMPILER: compiler +// GFX1030: (device-hip, gfx1030) +// GFX900: (device-hip, gfx900) +// OFFLOAD: offload + +int square(int num) { + return num * num; +} diff --git a/clang/test/Frontend/diags-interesting-source-region-colors.cpp b/clang/test/Frontend/diags-interesting-source-region-colors.cpp new file mode 100644 index 0000000000000..80db0873b9e0a --- /dev/null +++ b/clang/test/Frontend/diags-interesting-source-region-colors.cpp @@ -0,0 +1,30 @@ +// RUN: not %clang_cc1 %s -fmessage-length=40 -fcolor-diagnostics -fno-show-source-location -Wunused-value -o - 2>&1 | FileCheck %s + +// REQUIRES: ansi-escape-sequences + +int main() { + 1 + + if; + // CHECK: expected expression + // CHECK-NEXT: ...+ [[MAGENTA:.\[0;34m]]if[[RESET:.\[0m]]; + + /*😂*/1 + + if; + // CHECK: expected expression + // CHECK-NEXT: ...+ [[MAGENTA:.\[0;34m]]if[[RESET:.\[0m]]; + + a + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1; + // CHECK: use of undeclared identifier + // CHECK-NEXT: a + [[GREEN:.\[0;32m]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] + [[GREEN]]1[[RESET]] ... + + + /*😂😂😂*/ a + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1; + // CHECK: use of undeclared identifier + // CHECK-NEXT: [[YELLOW:.\[0;33m]]/*😂😂😂*/[[RESET]] a + [[GREEN:.\[0;32m]]1[[RESET]] + [[GREEN]]1[[RESET]] ... 
+ + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; + // CHECK: [[GREEN:.\[0;32m]]"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"[[RESET]]; + + "😂xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"; + // CHECK: [[GREEN:.\[0;32m]]"😂xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"[[RESET]]; +} + + diff --git a/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap b/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap index 186965177caaf..8ab6ae4779ea9 100644 --- a/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap +++ b/clang/test/Modules/Inputs/builtin-headers/system-modules.modulemap @@ -49,6 +49,11 @@ module cstd [system] [no_undeclared_includes] { export * } + module stdckdint { + header "stdckdint.h" + export * + } + module stdcountof { header "stdcountof.h" export * diff --git a/clang/test/Modules/builtin-headers.mm b/clang/test/Modules/builtin-headers.mm index ad2d66ae38dfd..6cd366228172e 100644 --- a/clang/test/Modules/builtin-headers.mm +++ b/clang/test/Modules/builtin-headers.mm @@ -17,6 +17,7 @@ @import _Builtin_stdarg; @import _Builtin_stdatomic; @import _Builtin_stdbool; +@import _Builtin_stdckdint; @import _Builtin_stdcountof; @import _Builtin_stddef; @import _Builtin_stdint; diff --git a/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm b/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm new file mode 100644 index 0000000000000..90c57796dcf7e --- /dev/null +++ b/clang/test/Modules/crash-enum-visibility-with-header-unit.cppm @@ -0,0 +1,46 @@ +// Fixes #165445 + +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 -x c++-user-header %t/header.h \ +// RUN: -emit-header-unit -o %t/header.pcm +// +// RUN: %clang_cc1 -std=c++20 %t/A.cppm -fmodule-file=%t/header.pcm \ +// RUN: -emit-module-interface -o %t/A.pcm +// +// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fmodule-file=%t/header.pcm \ +// RUN: -emit-module-interface -o %t/B.pcm +// +// RUN: %clang_cc1 -std=c++20 %t/use.cpp \ +// RUN: -fmodule-file=A=%t/A.pcm -fmodule-file=B=%t/B.pcm \ +// RUN: -fmodule-file=%t/header.pcm \ +// RUN: -verify -fsyntax-only + +//--- enum.h +enum E { Value }; + +//--- header.h +#include "enum.h" + +//--- A.cppm +module; +#include "enum.h" +export module A; + +auto e = Value; + +//--- B.cppm +export module B; +import "header.h"; + +auto e = Value; + +//--- use.cpp +// expected-no-diagnostics +import A; +import B; +#include "enum.h" + +auto e = Value; diff --git a/clang/test/OpenMP/metadirective_ast_print.c b/clang/test/OpenMP/metadirective_ast_print.c index 638dbae1bc774..75ef5fa26827c 100644 --- a/clang/test/OpenMP/metadirective_ast_print.c +++ b/clang/test/OpenMP/metadirective_ast_print.c @@ -2,17 +2,25 @@ // RUN: %clang_cc1 -verify -fopenmp-simd -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT -// RUN: %clang_cc1 -verify -fopenmp -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-AMDGCN +// RUN: %clang_cc1 -verify -fopenmp -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU -// RUN: %clang_cc1 -verify -fopenmp-simd -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-AMDGCN +// RUN: %clang_cc1 -verify -fopenmp-simd -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU + +// RUN: %clang_cc1 -verify -fopenmp -triple spirv64-intel -x c -std=c99 -ast-print 
%s -o - | FileCheck %s --check-prefix=DEFAULT-GPU + +// RUN: %clang_cc1 -verify -fopenmp-simd -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=DEFAULT-GPU // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52 // RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple x86_64-unknown-linux-gnu -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52 -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-AMDGCN +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU + +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -DOMP52 -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple amdgcn-amd-amdhsa -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-AMDGCN +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=52 -DOMP52 -triple spirv64-intel -x c -std=c99 -ast-print %s -o - | FileCheck %s --check-prefix=OMP52-GPU // expected-no-diagnostics #ifndef HEADER @@ -77,6 +85,12 @@ void foo1(void) { for (int i = 0; i < 100; i++) ; +#pragma omp metadirective when(device={arch("spirv64")}: \ + teams distribute parallel for)\ + otherwise(parallel for) + for (int i = 0; i < 100; i++) + ; + #pragma omp metadirective when(implementation = {extension(match_all)} \ : nothing) otherwise(parallel for) for (int i = 0; i < 16; i++) @@ -134,8 +148,8 @@ void foo1(void) { // OMP52-NEXT: for (int i = 0; i < 16; i++) { // OMP52-NEXT: #pragma omp simd // OMP52-NEXT: for (int j = 0; j < 16; j++) -// OMP52-AMDGCN: #pragma omp teams distribute parallel for -// OMP52-AMDGCN-NEXT: for (int i = 0; i < 100; i++) +// OMP52-GPU: #pragma omp teams distribute parallel for +// OMP52-GPU-NEXT: for (int i = 0; i < 100; i++) // OMP52: for (int i = 0; i < 16; i++) // OMP52: for (int i = 0; i < 16; i++) @@ -198,6 +212,12 @@ void foo2(void) { for (int i = 0; i < 100; i++) ; +#pragma omp metadirective when(device={arch("spirv64")}: \ + teams distribute parallel for)\ + default(parallel for) + for (int i = 0; i < 100; i++) + ; + #pragma omp metadirective when(implementation = {extension(match_all)} \ : nothing) default(parallel for) for (int i = 0; i < 16; i++) @@ -266,8 +286,8 @@ void foo2(void) { // DEFAULT-NEXT: for (int i = 0; i < 16; i++) { // DEFAULT-NEXT: #pragma omp simd // DEFAULT-NEXT: for (int j = 0; j < 16; j++) -// DEFAULT-AMDGCN: #pragma omp teams distribute parallel for -// DEFAULT-AMDGCN-NEXT: for (int i = 0; i < 100; i++) +// DEFAULT-GPU: #pragma omp teams distribute parallel for +// DEFAULT-GPU-NEXT: for (int i = 0; i < 100; i++) // DEFAULT: for (int i = 0; i < 16; i++) // DEFAULT: for (int i = 0; i < 16; i++) diff --git a/clang/test/OpenMP/metadirective_device_arch_codegen.cpp b/clang/test/OpenMP/metadirective_device_arch_codegen.cpp index eecae310d0a77..1d5584de67162 100644 --- a/clang/test/OpenMP/metadirective_device_arch_codegen.cpp +++ b/clang/test/OpenMP/metadirective_device_arch_codegen.cpp @@ -1,7 +1,7 @@ -// REQUIRES: 
amdgpu-registered-target - // RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-host.bc -target-cpu gfx906 -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-ppc-spirv-host.bc +// RUN: %clang_cc1 -fopenmp -x c++ -w -std=c++11 -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-ppc-spirv-host.bc -o - | FileCheck %s // expected-no-diagnostics @@ -16,6 +16,12 @@ Inspired from SOLLVE tests: #define N 1024 +#ifdef __AMDGPU__ +#define GPU "amdgcn" +#else +#define GPU "spirv64" +#endif + int metadirective1() { int v1[N], v2[N], v3[N]; @@ -26,7 +32,7 @@ int metadirective1() { #pragma omp target map(to:v1,v2) map(from:v3, target_device_num) device(default_device) { #pragma omp metadirective \ - when(device={arch("amdgcn")}: teams distribute parallel for) \ + when(device={arch(GPU)}: teams distribute parallel for) \ default(parallel for) for (int i = 0; i < N; i++) { @@ -38,28 +44,28 @@ int metadirective1() { return errors; } -// CHECK: define weak_odr protected amdgpu_kernel void @[[METADIRECTIVE:.+metadirective1[a-z0-9_]+]] +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @[[METADIRECTIVE:.+metadirective1[a-z0-9_]+]] // CHECK: entry: -// CHECK: %{{[0-9]}} = call i32 @__kmpc_target_init +// CHECK: %{{[0-9]}} = call{{.*}} i32 @__kmpc_target_init // CHECK: user_code.entry: -// CHECK: call void @[[METADIRECTIVE]]_omp_outlined -// CHECK-NOT: call void @__kmpc_parallel_51 +// CHECK: call{{.*}} void @[[METADIRECTIVE]]_omp_outlined +// CHECK-NOT: call{{.*}} void @__kmpc_parallel_51 // CHECK: ret void // CHECK: define internal void @[[METADIRECTIVE]]_omp_outlined // CHECK: entry: -// CHECK: call void @__kmpc_distribute_static_init +// CHECK: call{{.*}} void @__kmpc_distribute_static_init // CHECK: omp.loop.exit: -// CHECK: call void @__kmpc_distribute_static_fini +// CHECK: call{{.*}} void @__kmpc_distribute_static_fini // CHECK: define internal void @[[METADIRECTIVE]]_omp_outlined_omp_outlined // CHECK: entry: -// CHECK: call void @__kmpc_for_static_init_4 +// CHECK: call{{.*}} void @__kmpc_for_static_init_4 // CHECK: omp.inner.for.body: // CHECK: store atomic {{.*}} monotonic // CHECK: omp.loop.exit: -// CHECK-NEXT: call void @__kmpc_for_static_fini +// CHECK-NEXT: call{{.*}} void @__kmpc_for_static_fini // CHECK-NEXT: ret void diff --git a/clang/test/OpenMP/thread_limit_amdgpu.c b/clang/test/OpenMP/thread_limit_amdgpu.c deleted file mode 100644 index f884eeb73c3ff..0000000000000 --- a/clang/test/OpenMP/thread_limit_amdgpu.c +++ /dev/null @@ -1,34 +0,0 @@ -// Test target codegen - host bc file has to be created first. 
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s -// expected-no-diagnostics - -#ifndef HEADER -#define HEADER - -void foo(int N) { -#pragma omp target teams distribute parallel for simd - for (int i = 0; i < N; ++i) - ; -#pragma omp target teams distribute parallel for simd thread_limit(4) - for (int i = 0; i < N; ++i) - ; -#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) - for (int i = 0; i < N; ++i) - ; -#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22) - for (int i = 0; i < N; ++i) - ; -} - -#endif - -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l10({{.*}}) #[[ATTR1:.+]] { -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l13({{.*}}) #[[ATTR2:.+]] { -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l16({{.*}}) #[[ATTR3:.+]] { -// CHECK: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l19({{.*}}) #[[ATTR4:.+]] { - -// CHECK: attributes #[[ATTR1]] = { {{.*}} "amdgpu-flat-work-group-size"="1,256" {{.*}} } -// CHECK: attributes #[[ATTR2]] = { {{.*}} "amdgpu-flat-work-group-size"="1,4" {{.*}} } -// CHECK: attributes #[[ATTR3]] = { {{.*}} "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} } -// CHECK: attributes #[[ATTR4]] = { {{.*}} "amdgpu-flat-work-group-size"="1,22" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} } diff --git a/clang/test/OpenMP/thread_limit_gpu.c b/clang/test/OpenMP/thread_limit_gpu.c new file mode 100644 index 0000000000000..4bcc14d070c22 --- /dev/null +++ b/clang/test/OpenMP/thread_limit_gpu.c @@ -0,0 +1,41 @@ +// Test target codegen - host bc file has to be created first. 
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck -check-prefixes=CHECK,CHECK-AMDGPU %s +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-unknown-linux-gnu -fopenmp-targets=spirv64-intel -emit-llvm-bc %s -o %t-x86-spirv-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple spirv64-intel -fopenmp-targets=spirv64-intel -emit-llvm %s -fopenmp-is-target-device -fopenmp-host-ir-file-path %t-x86-spirv-host.bc -o - | FileCheck -check-prefixes=CHECK,CHECK-SPIRV %s +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +void foo(int N) { +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < N; ++i) + ; +#pragma omp target teams distribute parallel for simd thread_limit(4) + for (int i = 0; i < N; ++i) + ; +#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) + for (int i = 0; i < N; ++i) + ; +#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22) + for (int i = 0; i < N; ++i) + ; +} + +#endif + +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l12({{.*}}) #[[ATTR1:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l15({{.*}}) #[[ATTR2:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l18({{.*}}) #[[ATTR3:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l21({{.*}}) #[[ATTR4:.+]] { + +// CHECK-AMDGPU: attributes #[[ATTR1]] = { {{.*}} "amdgpu-flat-work-group-size"="1,256" {{.*}} } +// CHECK-AMDGPU: attributes #[[ATTR2]] = { {{.*}} "amdgpu-flat-work-group-size"="1,4" {{.*}} } +// CHECK-AMDGPU: attributes #[[ATTR3]] = { {{.*}} "amdgpu-flat-work-group-size"="1,42" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} } +// CHECK-AMDGPU: attributes #[[ATTR4]] = { {{.*}} "amdgpu-flat-work-group-size"="1,22" "amdgpu-max-num-workgroups"="42,1,1"{{.*}} } + +// CHECK-SPIRV: attributes #[[ATTR1]] = { {{.*}} "omp_target_thread_limit"="256" {{.*}} } +// CHECK-SPIRV: attributes #[[ATTR2]] = { {{.*}} "omp_target_thread_limit"="4" {{.*}} } +// CHECK-SPIRV: attributes #[[ATTR3]] = { {{.*}} "omp_target_num_teams"="42" "omp_target_thread_limit"="42" {{.*}} } +// CHECK-SPIRV: attributes #[[ATTR4]] = { {{.*}} "omp_target_num_teams"="42" "omp_target_thread_limit"="22" {{.*}} } diff --git a/clang/test/Parser/ms-empty-enum.c b/clang/test/Parser/ms-empty-enum.c new file mode 100644 index 0000000000000..790547af88bab --- /dev/null +++ b/clang/test/Parser/ms-empty-enum.c @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 %s -fsyntax-only -Wmicrosoft -verify -fms-extensions + +typedef enum tag1 { } A; // expected-warning {{empty enumeration types are a Microsoft extension}} +typedef enum tag2 { } B; // expected-warning {{empty enumeration types are a Microsoft extension}} +typedef enum : unsigned { } C; // expected-warning {{enumeration types with a fixed underlying type are a Microsoft extension}}\ + // expected-warning {{empty enumeration types are a Microsoft extension}} diff --git 
a/clang/test/Preprocessor/unwind-tables.c b/clang/test/Preprocessor/unwind-tables.c index 0a863d79adbf6..5ff990d0c40a6 100644 --- a/clang/test/Preprocessor/unwind-tables.c +++ b/clang/test/Preprocessor/unwind-tables.c @@ -1,11 +1,13 @@ // RUN: %clang %s -dM -E -target x86_64-windows | FileCheck %s --check-prefix=NO // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables | FileCheck %s --check-prefix=NO +// RUN: %clang %s -dM -E -target x86_64 -fno-dwarf2-cfi-asm | FileCheck %s --check-prefix=NO // RUN: %clang %s -dM -E -target x86_64 | FileCheck %s // RUN: %clang %s -dM -E -target x86_64 -funwind-tables -fno-asynchronous-unwind-tables -g | FileCheck %s // RUN: %clang %s -dM -E -target aarch64-apple-darwin | FileCheck %s // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables -g | FileCheck %s // RUN: %clang %s -dM -E -target x86_64 -fno-asynchronous-unwind-tables -fexceptions | FileCheck %s +// RUN: %clang %s -dM -E -target x86_64-windows -fdwarf2-cfi-asm | FileCheck %s // NO-NOT: #define __GCC_HAVE_DWARF2_CFI_ASM // CHECK: #define __GCC_HAVE_DWARF2_CFI_ASM 1 diff --git a/clang/test/Sema/attr-nonblocking-constraints.cpp b/clang/test/Sema/attr-nonblocking-constraints.cpp index b26a945843696..881e816292d59 100644 --- a/clang/test/Sema/attr-nonblocking-constraints.cpp +++ b/clang/test/Sema/attr-nonblocking-constraints.cpp @@ -235,16 +235,35 @@ void nb13() [[clang::nonblocking]] { nb12(); } // C++ member function pointers struct PTMFTester { typedef void (PTMFTester::*ConvertFunction)() [[clang::nonblocking]]; - - void convert() [[clang::nonblocking]]; + typedef void (PTMFTester::*BlockingFunction)(); ConvertFunction mConvertFunc; -}; -void PTMFTester::convert() [[clang::nonblocking]] -{ - (this->*mConvertFunc)(); -} + void convert() [[clang::nonblocking]] + { + (this->*mConvertFunc)(); // This should not generate a warning. + } + + template + struct Holder { + T value; + + T& operator*() { return value; } + }; + + + void ptmfInExpr(Holder& holder) [[clang::nonblocking]] + { + (this->*(*holder))(); // Should not generate a warning. + ((*this).*(*holder))(); // Should not generate a warning. + } + + void ptmfInExpr(Holder& holder) [[clang::nonblocking]] + { + (this->*(*holder))(); // expected-warning {{function with 'nonblocking' attribute must not call non-'nonblocking' expression}} + ((*this).*(*holder))(); // expected-warning {{function with 'nonblocking' attribute must not call non-'nonblocking' expression}} + } +}; // Allow implicit conversion from array to pointer. void nb14(unsigned idx) [[clang::nonblocking]] @@ -354,6 +373,33 @@ struct Unsafe { Unsafe(float y) [[clang::nonblocking]] : Unsafe(int(y)) {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}} }; +// Exercise cases of a temporary with a safe constructor and unsafe destructor. 
+void nb23() +{ + struct X { + int *ptr = nullptr; + X() {} + ~X() { delete ptr; } // expected-note 2 {{destructor cannot be inferred 'nonblocking' because it allocates or deallocates memory}} + }; + + auto inner = []() [[clang::nonblocking]] { + X(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor 'nb23()::X::~X'}} + }; + + auto inner2 = [](X x) [[clang::nonblocking]] { // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor 'nb23()::X::~X'}} + }; + +} + +struct S2 { ~S2(); }; // expected-note 2 {{declaration cannot be inferred 'nonblocking' because it has no definition in this translation unit}} +void nb24() { + S2 s; + [&]() [[clang::nonblocking]] { + [s]{ auto x = &s; }(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor}} expected-note {{destructor cannot be inferred 'nonblocking' because it calls non-'nonblocking' destructor 'S2::~S2'}} + [=]{ auto x = &s; }(); // expected-warning {{lambda with 'nonblocking' attribute must not call non-'nonblocking' destructor}} expected-note {{destructor cannot be inferred 'nonblocking' because it calls non-'nonblocking' destructor 'S2::~S2'}} + }(); +} + struct DerivedFromUnsafe : public Unsafe { DerivedFromUnsafe() [[clang::nonblocking]] {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}} DerivedFromUnsafe(int x) [[clang::nonblocking]] : Unsafe(x) {} // expected-warning {{constructor with 'nonblocking' attribute must not call non-'nonblocking' constructor 'Unsafe::Unsafe'}} diff --git a/clang/test/Sema/labeled-break-continue.c b/clang/test/Sema/labeled-break-continue.c index 78f81c484c3d5..6b4adc23dca8d 100644 --- a/clang/test/Sema/labeled-break-continue.c +++ b/clang/test/Sema/labeled-break-continue.c @@ -1,6 +1,6 @@ -// RUN: %clang_cc1 -std=c2y -verify -fsyntax-only -fblocks %s -// RUN: %clang_cc1 -std=c23 -verify -fsyntax-only -fblocks -fnamed-loops %s -// RUN: %clang_cc1 -x c++ -verify -fsyntax-only -fblocks -fnamed-loops %s +// RUN: %clang_cc1 -std=c2y -verify -Wunused -fsyntax-only -fblocks %s +// RUN: %clang_cc1 -std=c23 -verify -Wunused -fsyntax-only -fblocks -fnamed-loops %s +// RUN: %clang_cc1 -x c++ -verify -Wunused -fsyntax-only -fblocks -fnamed-loops %s void f1() { l1: while (true) { @@ -159,3 +159,15 @@ void f7() { continue d; // expected-error {{'continue' label does not name an enclosing loop}} } } + +void f8() { + l1: // no-warning + while (true) { + break l1; + } + + l2: // no-warning + while (true) { + continue l2; + } +} diff --git a/clang/test/SemaCXX/constant-expression-cxx14.cpp b/clang/test/SemaCXX/constant-expression-cxx14.cpp index bea90ff7eaf8a..1fc6e5ec4cc55 100644 --- a/clang/test/SemaCXX/constant-expression-cxx14.cpp +++ b/clang/test/SemaCXX/constant-expression-cxx14.cpp @@ -1450,3 +1450,9 @@ namespace GH149500 { unsigned int * p = &(*(unsigned int *)0x400); static const void *q = &(*(const struct sysrq_key_op *)0); } + +constexpr bool missingCase() { + switch (1) { + 1u: return false; // expected-error {{expected 'case' keyword before expression}} + } +} diff --git a/clang/test/SemaCXX/vector.cpp b/clang/test/SemaCXX/vector.cpp index 808bdb679b09c..06195f039cd92 100644 --- a/clang/test/SemaCXX/vector.cpp +++ b/clang/test/SemaCXX/vector.cpp @@ -786,3 +786,16 @@ const long long e = *0; // expected-error {{indirection requires pointer operand double f = a - e; // expected-error {{cannot initialize 
a variable of type 'double' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(double)))) double' (vector of 1 'double' value)}} int h = c - e; // expected-error {{cannot initialize a variable of type 'int' with an rvalue of type '__attribute__((__vector_size__(1 * sizeof(long)))) long' (vector of 1 'long' value)}} } + +typedef int v_neg_size __attribute__((vector_size(-8))); // expected-error{{vector must have non-negative size}} +typedef int v_neg_size_2 __attribute__((vector_size(-1 * 8))); // expected-error{{vector must have non-negative size}} +typedef int v_ext_neg_size __attribute__((ext_vector_type(-8))); // expected-error{{vector must have non-negative size}} +typedef int v_ext_neg_size2 __attribute__((ext_vector_type(-1 * 8))); // expected-error{{vector must have non-negative size}} + + +#if __cplusplus >= 201103L + +template using templated_v_size = int __attribute__((vector_size(N))); // expected-error{{vector must have non-negative size}} +templated_v_size<-8> templated_v_neg_size; //expected-note{{in instantiation of template type alias 'templated_v_size' requested here}} + +#endif diff --git a/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl new file mode 100644 index 0000000000000..8f2f9308ed966 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/f16tof32-errors.hlsl @@ -0,0 +1,134 @@ +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple dxil-pc-shadermodel6.6-library %s -fnative-int16-type -emit-llvm-only -disable-llvm-passes -verify + +float builtin_f16tof32_too_few_arg() { + return __builtin_hlsl_elementwise_f16tof32(); + // expected-error@-1 {{too few arguments to function call, expected 1, have 0}} + // expected-note@hlsl/hlsl_alias_intrinsics.h:* 4 {{candidate function not viable: requires 1 argument, but 0 were provided}} +} + +float builtin_f16tof32_too_many_arg(uint p0) { + return __builtin_hlsl_elementwise_f16tof32(p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 1, have 2}} + // expected-note@hlsl/hlsl_alias_intrinsics.h:* 4 {{candidate function not viable: requires 1 argument, but 2 were provided}} +} + +float builtin_f16tof32_bool(bool p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool')}} +} + +float builtin_f16tof32_bool4(bool4 p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool4' (aka 'vector')}} +} + +float builtin_f16tof32_short(short p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'short')}} +} + +float builtin_f16tof32_unsigned_short(unsigned short p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{incorrect number of bits in integer (expected 32 bits, have 16)}} +} + +float builtin_f16tof32_int(int p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int')}} +} + +float builtin_f16tof32_int64_t(long p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'long')}} +} + +float2 builtin_f16tof32_int2_to_float2_promotion(int2 p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must 
be a scalar or vector of unsigned integer types (was 'int2' (aka 'vector'))}} +} + +float builtin_f16tof32_half(half p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half')}} +} + +float builtin_f16tof32_half4(half4 p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half4' (aka 'vector'))}} +} + +float builtin_f16tof32_float(float p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'float')}} +} + +float builtin_f16tof32_double(double p0) { + return __builtin_hlsl_elementwise_f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'double')}} +} + +float f16tof32_too_few_arg() { + return f16tof32(); + // expected-error@-1 {{no matching function for call to 'f16tof32'}} +} + +float f16tof32_too_many_arg(uint p0) { + return f16tof32(p0, p0); + // expected-error@-1 {{no matching function for call to 'f16tof32'}} +} + +float f16tof32_bool(bool p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool')}} +} + +float f16tof32_bool3(bool3 p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'bool3' (aka 'vector'))}} +} + + +float f16tof32_int16_t(short p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'short')}} +} + +float f16tof32_int16_t(unsigned short p0) { + return f16tof32(p0); + // expected-error@-1 {{incorrect number of bits in integer (expected 32 bits, have 16)}} +} + +float f16tof32_int(int p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int')}} +} + +float f16tof32_int64_t(long p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'long')}} +} + +float2 f16tof32_int2_to_float2_promotion(int3 p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'int3' (aka 'vector'))}} +} + +float f16tof32_half(half p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half')}} +} + +float f16tof32_half2(half2 p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'half2' (aka 'vector'))}} +} + +float f16tof32_float(float p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'float')}} +} + +float f16tof32_double(double p0) { + return f16tof32(p0); + // expected-error@-1 {{1st argument must be a scalar or vector of unsigned integer types (was 'double')}} +} diff --git a/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl index 12c818acec035..b2f45051a9bd8 100644 --- a/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl +++ b/clang/test/SemaHLSL/BuiltIns/select-errors.hlsl @@ -15,7 +15,7 @@ int2 test_select_vector_vals_not_vecs(bool2 p0, int t0, } int1 test_select_vector_vals_wrong_size(bool2 p0, int1 t0, int1 f0) { - return select(p0, t0, f0); // 
expected-warning{{implicit conversion truncates vector: 'bool2' (aka 'vector') to 'vector' (vector of 1 'bool' value)}} + return select(p0, t0, f0); // No diagnostic expected. } int test_select_no_args() { diff --git a/clang/test/SemaTemplate/ctad.cpp b/clang/test/SemaTemplate/ctad.cpp index 1a575ea527006..60603f0c963a5 100644 --- a/clang/test/SemaTemplate/ctad.cpp +++ b/clang/test/SemaTemplate/ctad.cpp @@ -104,3 +104,15 @@ namespace ConvertDeducedTemplateArgument { auto x = C(D()); } + +namespace pr165560 { +template struct S { + using A = T; + template struct I { // expected-note{{candidate function template not viable: requires 1 argument, but 0 were provided}} \ + // expected-note{{implicit deduction guide declared as 'template I(pr165560::S::I) -> pr165560::S::I'}} + I(typename A::F) {} // expected-error{{type 'A' (aka 'int') cannot be used prior to '::' because it has no members}} + }; +}; +S::I i; // expected-error{{no viable constructor or deduction guide for deduction of template arguments of 'S::I'}} \ + // expected-note{{while building implicit deduction guide first needed here}} +} diff --git a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp index ef229606de0f0..8fc9a66dbda7e 100644 --- a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp +++ b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp @@ -2076,4 +2076,19 @@ TEST(ExprMutationAnalyzerTest, PointeeMutatedByReturn) { } } +TEST(ExprMutationAnalyzerTest, PointeeMutatedByPointerToMemberOperator) { + // GH161913 + const std::string Code = R"( + struct S { int i; }; + void f(S s) { + S *x = &s; + (x->*(&S::i))++; + } + )"; + auto AST = buildASTFromCodeWithArgs(Code, {"-Wno-everything"}); + auto Results = + match(withEnclosingCompound(declRefTo("x")), AST->getASTContext()); + EXPECT_TRUE(isPointeeMutated(Results, AST.get())); +} + } // namespace clang diff --git a/clang/unittests/CodeGen/CMakeLists.txt b/clang/unittests/CodeGen/CMakeLists.txt index f5bcecb0b08a3..d4efb2230a054 100644 --- a/clang/unittests/CodeGen/CMakeLists.txt +++ b/clang/unittests/CodeGen/CMakeLists.txt @@ -1,6 +1,7 @@ add_clang_unittest(ClangCodeGenTests BufferSourceTest.cpp CodeGenExternalTest.cpp + DemangleTrapReasonInDebugInfo.cpp TBAAMetadataTest.cpp CheckTargetFeaturesTest.cpp CLANG_LIBS diff --git a/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp b/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp new file mode 100644 index 0000000000000..17bfe17c31d65 --- /dev/null +++ b/clang/unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp @@ -0,0 +1,67 @@ +//=== unittests/CodeGen/DemangleTrapReasonInDebugInfo.cpp -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/CodeGen/ModuleBuilder.h" +#include "llvm/ADT/StringRef.h" +#include "gtest/gtest.h" + +using namespace clang::CodeGen; + +void CheckValidCommon(llvm::StringRef FuncName, const char *ExpectedCategory, + const char *ExpectedMessage) { + auto MaybeTrapReason = DemangleTrapReasonInDebugInfo(FuncName); + ASSERT_TRUE(MaybeTrapReason.has_value()); + auto [Category, Message] = MaybeTrapReason.value(); + ASSERT_STREQ(Category.str().c_str(), ExpectedCategory); + ASSERT_STREQ(Message.str().c_str(), ExpectedMessage); +} + +void CheckInvalidCommon(llvm::StringRef FuncName) { + auto MaybeTrapReason = DemangleTrapReasonInDebugInfo(FuncName); + ASSERT_TRUE(!MaybeTrapReason.has_value()); +} + +TEST(DemangleTrapReasonInDebugInfo, Valid) { + std::string FuncName(ClangTrapPrefix); + FuncName += "$trap category$trap message"; + CheckValidCommon(FuncName, "trap category", "trap message"); +} + +TEST(DemangleTrapReasonInDebugInfo, ValidEmptyCategory) { + std::string FuncName(ClangTrapPrefix); + FuncName += "$$trap message"; + CheckValidCommon(FuncName, "", "trap message"); +} + +TEST(DemangleTrapReasonInDebugInfo, ValidEmptyMessage) { + std::string FuncName(ClangTrapPrefix); + FuncName += "$trap category$"; + CheckValidCommon(FuncName, "trap category", ""); +} + +TEST(DemangleTrapReasonInDebugInfo, ValidAllEmpty) { + // `__builtin_verbose_trap` actually allows this + // currently. However, we should probably disallow this in Sema because having + // an empty category and message completely defeats the point of using the + // builtin (#165981). + std::string FuncName(ClangTrapPrefix); + FuncName += "$$"; + CheckValidCommon(FuncName, "", ""); +} + +TEST(DemangleTrapReasonInDebugInfo, InvalidOnlyPrefix) { + std::string FuncName(ClangTrapPrefix); + CheckInvalidCommon(FuncName); +} + +TEST(DemangleTrapReasonInDebugInfo, Invalid) { + std::string FuncName("foo"); + CheckInvalidCommon(FuncName); +} + +TEST(DemangleTrapReasonInDebugInfo, InvalidEmpty) { CheckInvalidCommon(""); } diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index ca9e7925e5e95..24235b966399d 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -20824,6 +20824,13 @@ TEST_F(FormatTest, AlignWithLineBreaks) { " argument1,\n" " argument2);", Style); + + Style.ColumnLimit = 45; + verifyFormat("auto xxxxxxxx = foo;\n" + "auto x = whatever ? some / long -\n" + " computition / stuff\n" + " : random;", + Style); } TEST_F(FormatTest, AlignWithInitializerPeriods) { diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 0312c9dfc0665..e9fadb2dbd4ac 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -40,2843 +40,3314 @@

C++ defect report implementation status

This page tracks which C++ defect reports are implemented within Clang.

[Omitted: the generated rows of the defect report table in this cxx_dr_status.html hunk. The HTML table markup was stripped during extraction, leaving only bare +/- diff markers, so the row contents are not recoverable here.]
diff --git a/clang/www/make_cxx_dr_status b/clang/www/make_cxx_dr_status index 485a9a56267ca..3ba12e13a7354 100755 --- a/clang/www/make_cxx_dr_status +++ b/clang/www/make_cxx_dr_status @@ -10,26 +10,36 @@ output = os.path.join(clang_www_dir, 'cxx_dr_status.html') dr_test_dir = os.path.join(clang_www_dir, '../test/CXX/drs') class DR: - def __init__(self, section, issue, url, status, title): - self.section, self.issue, self.url, self.status, self.title = \ - section, issue, url, status, title + def __init__(self, *, section_number, section_name, section_link, number, url, status, liaison, title): + self.section_number, self.section_name, self.section_link, self.number, self.url, self.status, self.liaison, self.title = \ + section_number, section_name, section_link, number, url, status, liaison, title def __repr__(self): - return '%s (%s): %s' % (self.issue, self.status, self.title) - -def parse(dr): - try: - section, issue_link, status, liaison, title = [
- col.split('>', 1)[1].split('')[0] - for col in dr.split('', 1)[0].split('', 1)[1].split('<', 1)[0]) - title = title.replace('', '').replace('', '').replace('\r\n', '\n').strip() - return DR(section, issue, url, status, title) + return '%s (%s): %s' % (self.number, self.status, self.title) + + pattern = re.compile(''' +(?P.*) (?P.*) + +(?P.*) +(?P.*) +(?P.*) +(?P[\\w\\W]*)</issue_title></TD> +</TR>''') + + @classmethod + def parse_from_html(cls, html_string): + match = cls.pattern.match(html_string) + if match is None: + print(f"Parse error: {html_string}", file=sys.stderr) + exit(1) + return cls( + section_number=match.group('section_number'), + section_name=match.group('section_name'), + section_link=match.group('section_link'), + number=int(match.group('number')), + url=match.group('url'), + status=match.group('status'), + liaison=match.group('liaison'), + title=match.group('title').replace('\n', ' ').strip()) def collect_tests(): status_re = re.compile(r'\bcwg([0-9]+): (.*)') @@ -68,8 +78,8 @@ def get_issues(path): print(ex, file=sys.stderr) sys.exit(1) - return sorted((parse(dr) for dr in buffer.split('<TR>')[2:]), - key = lambda dr: dr.issue) + return sorted((DR.parse_from_html(dr) for dr in buffer.split('<TR>')[2:]), + key = lambda dr: dr.number) issue_list_path = None @@ -127,9 +137,10 @@ out_html.append('''\ <p>This page tracks which C++ defect reports are implemented within Clang.</p> -<table width="689" border="1" cellspacing="0"> +<table width="892" border="1" cellspacing="0"> <tr> <th>Number</th> + <th>Section</th> <th>Status</th> <th>Issue title</th> <th>Available in Clang?</th> @@ -149,7 +160,7 @@ def availability(issue): unresolved_status = unresolved_status_match.group(1) proposed_resolution_match = re.search(r' (open|drafting|review|tentatively ready|ready) (\d{4}-\d{2}(?:-\d{2})?|P\d{4}R\d+)$', status) if proposed_resolution_match is None: - raise AvailabilityError('error: issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.issue, unresolved_status)) + raise AvailabilityError('error: issue {}: \'{}\' status should be followed by a paper number (P1234R5) or proposed resolution in YYYY-MM-DD format'.format(dr.number, unresolved_status)) proposed_resolution = proposed_resolution_match.group(2) status = status[:-1-len(proposed_resolution)] status = status[:-1-len(unresolved_status)] @@ -236,7 +247,7 @@ def availability(issue): avail = 'Duplicate of <a href="#%s">%s</a>' % (dup, dup) _, avail_style, _, _ = availability(dup) else: - raise AvailabilityError('error: unknown status %s for issue %s' % (status, dr.issue)) + raise AvailabilityError('error: unknown status %s for issue %s' % (status, dr.number)) return (avail + avail_suffix, avail_style, unresolved_status, details) count = {} @@ -254,7 +265,7 @@ for dr in drs: elif dr.status in ('open', 'drafting', 'review', 'tentatively ready', 'ready'): row_style = ' class="open"' try: - avail, avail_style, unresolved_status, details = availability(dr.issue) + avail, avail_style, unresolved_status, details = availability(dr.number) except AvailabilityError as e: availability_error_occurred = True print(e.args[0]) @@ -267,12 +278,12 @@ for dr in drs: if unresolved_status != dr.status: availability_error_occurred = True print("error: issue %s is marked '%s', which differs from CWG index status '%s'" \ - % (dr.issue, unresolved_status, dr.status)) + % (dr.number, unresolved_status, dr.status)) continue else: row_style = '' try: - avail, avail_style, 
unresolved_status, details = availability(dr.issue) + avail, avail_style, unresolved_status, details = availability(dr.number) except AvailabilityError as e: availability_error_occurred = True print(e.args[0]) @@ -281,7 +292,7 @@ for dr in drs: if unresolved_status: availability_error_occurred = True print("error: issue %s is marked '%s', even though it is resolved in CWG index" \ - % (dr.issue, unresolved_status)) + % (dr.number, unresolved_status)) continue if not avail.startswith('Sup') and not avail.startswith('Dup'): @@ -297,8 +308,9 @@ for dr in drs: {details} </details>''' out_html.append(f''' - <tr{row_style} id="{dr.issue}"> - <td><a href="https://cplusplus.github.io/CWG/issues/{dr.issue}.html">{dr.issue}</a></td> + <tr{row_style} id="{dr.number}"> + <td><a href="https://cplusplus.github.io/CWG/issues/{dr.number}.html">{dr.number}</a></td> + <td>[<a href="{dr.section_link}">{dr.section_name}</a>]</td> <td>{dr.status}</td> <td>{dr.title}</td> <td{avail_style} align="center">{avail}</td> diff --git a/compiler-rt/lib/asan/scripts/asan_symbolize.py b/compiler-rt/lib/asan/scripts/asan_symbolize.py index 8ecd66c745119..091e9bcc9a796 100755 --- a/compiler-rt/lib/asan/scripts/asan_symbolize.py +++ b/compiler-rt/lib/asan/scripts/asan_symbolize.py @@ -59,6 +59,7 @@ def is_valid_arch(s): "armv7s", "armv7k", "arm64", + "arm64e", "powerpc64", "powerpc64le", "s390x", diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c index c21b2bad1d212..45b7055abf454 100644 --- a/compiler-rt/lib/builtins/cpu_model/x86.c +++ b/compiler-rt/lib/builtins/cpu_model/x86.c @@ -21,7 +21,9 @@ #if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) +#if __STDC_HOSTED__ #include <assert.h> +#endif // __STDC_HOSTED__ #if (defined(__GNUC__) || defined(__clang__)) && !defined(_MSC_VER) #include <cpuid.h> @@ -245,8 +247,8 @@ struct __processor_model { unsigned int __cpu_features[1]; } __cpu_model = {0, 0, 0, {0}}; -static_assert(sizeof(__cpu_model) == 16, - "Wrong size of __cpu_model will result in ABI break"); +_Static_assert(sizeof(__cpu_model) == 16, + "Wrong size of __cpu_model will result in ABI break"); // This code is copied from lib/Support/Host.cpp. // Changes to either file should be mirrored in the other. @@ -1200,8 +1202,8 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) { unsigned Vendor; unsigned Model, Family; unsigned Features[(CPU_FEATURE_MAX + 31) / 32] = {0}; - static_assert(sizeof(Features) / sizeof(Features[0]) == 4, ""); - static_assert(sizeof(__cpu_features2) / sizeof(__cpu_features2[0]) == 3, ""); + _Static_assert(sizeof(Features) / sizeof(Features[0]) == 4, ""); + _Static_assert(sizeof(__cpu_features2) / sizeof(__cpu_features2[0]) == 3, ""); // This function needs to run just once. 
if (__cpu_model.__cpu_vendor) @@ -1234,9 +1236,11 @@ int CONSTRUCTOR_ATTRIBUTE __cpu_indicator_init(void) { } else __cpu_model.__cpu_vendor = VENDOR_OTHER; +#if __STDC_HOSTED__ assert(__cpu_model.__cpu_vendor < VENDOR_MAX); assert(__cpu_model.__cpu_type < CPU_TYPE_MAX); assert(__cpu_model.__cpu_subtype < CPU_SUBTYPE_MAX); +#endif // __STDC_HOSTED__ return 0; } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h index ba85a0eb5a35e..b515b15b327d8 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h @@ -737,6 +737,7 @@ enum ModuleArch { kModuleArchARMV7S, kModuleArchARMV7K, kModuleArchARM64, + kModuleArchARM64E, kModuleArchLoongArch64, kModuleArchRISCV64, kModuleArchHexagon @@ -810,6 +811,8 @@ inline const char *ModuleArchToString(ModuleArch arch) { return "armv7k"; case kModuleArchARM64: return "arm64"; + case kModuleArchARM64E: + return "arm64e"; case kModuleArchLoongArch64: return "loongarch64"; case kModuleArchRISCV64: diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp index b0a29db908639..90c0b66f81b5b 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp @@ -960,7 +960,17 @@ static void DisableMmapExcGuardExceptions() { RTLD_DEFAULT, "task_set_exc_guard_behavior"); if (set_behavior == nullptr) return; const task_exc_guard_behavior_t task_exc_guard_none = 0; - set_behavior(mach_task_self(), task_exc_guard_none); + kern_return_t res = set_behavior(mach_task_self(), task_exc_guard_none); + if (res != KERN_SUCCESS) { + Report( + "WARN: task_set_exc_guard_behavior returned %d (%s), " + "mmap may fail unexpectedly.\n", + res, mach_error_string(res)); + if (res == KERN_DENIED) + Report( + "HINT: Check that task_set_exc_guard_behavior is allowed by " + "sandbox.\n"); + } } static void VerifyInterceptorsWorking(); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp index a9533d6fc04ca..a5ec85ae16460 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_mac.cpp @@ -20,18 +20,21 @@ #include <mach/mach.h> // These are not available in older macOS SDKs. 
-#ifndef CPU_SUBTYPE_X86_64_H -#define CPU_SUBTYPE_X86_64_H ((cpu_subtype_t)8) /* Haswell */ -#endif -#ifndef CPU_SUBTYPE_ARM_V7S -#define CPU_SUBTYPE_ARM_V7S ((cpu_subtype_t)11) /* Swift */ -#endif -#ifndef CPU_SUBTYPE_ARM_V7K -#define CPU_SUBTYPE_ARM_V7K ((cpu_subtype_t)12) -#endif -#ifndef CPU_TYPE_ARM64 -#define CPU_TYPE_ARM64 (CPU_TYPE_ARM | CPU_ARCH_ABI64) -#endif +# ifndef CPU_SUBTYPE_X86_64_H +# define CPU_SUBTYPE_X86_64_H ((cpu_subtype_t)8) /* Haswell */ +# endif +# ifndef CPU_SUBTYPE_ARM_V7S +# define CPU_SUBTYPE_ARM_V7S ((cpu_subtype_t)11) /* Swift */ +# endif +# ifndef CPU_SUBTYPE_ARM_V7K +# define CPU_SUBTYPE_ARM_V7K ((cpu_subtype_t)12) +# endif +# ifndef CPU_TYPE_ARM64 +# define CPU_TYPE_ARM64 (CPU_TYPE_ARM | CPU_ARCH_ABI64) +# endif +# ifndef CPU_SUBTYPE_ARM64E +# define CPU_SUBTYPE_ARM64E ((cpu_subtype_t)2) +# endif namespace __sanitizer { @@ -311,18 +314,26 @@ ModuleArch ModuleArchFromCpuType(cpu_type_t cputype, cpu_subtype_t cpusubtype) { case CPU_TYPE_I386: return kModuleArchI386; case CPU_TYPE_X86_64: - if (cpusubtype == CPU_SUBTYPE_X86_64_ALL) return kModuleArchX86_64; - if (cpusubtype == CPU_SUBTYPE_X86_64_H) return kModuleArchX86_64H; + if (cpusubtype == CPU_SUBTYPE_X86_64_ALL) + return kModuleArchX86_64; + if (cpusubtype == CPU_SUBTYPE_X86_64_H) + return kModuleArchX86_64H; CHECK(0 && "Invalid subtype of x86_64"); return kModuleArchUnknown; case CPU_TYPE_ARM: - if (cpusubtype == CPU_SUBTYPE_ARM_V6) return kModuleArchARMV6; - if (cpusubtype == CPU_SUBTYPE_ARM_V7) return kModuleArchARMV7; - if (cpusubtype == CPU_SUBTYPE_ARM_V7S) return kModuleArchARMV7S; - if (cpusubtype == CPU_SUBTYPE_ARM_V7K) return kModuleArchARMV7K; + if (cpusubtype == CPU_SUBTYPE_ARM_V6) + return kModuleArchARMV6; + if (cpusubtype == CPU_SUBTYPE_ARM_V7) + return kModuleArchARMV7; + if (cpusubtype == CPU_SUBTYPE_ARM_V7S) + return kModuleArchARMV7S; + if (cpusubtype == CPU_SUBTYPE_ARM_V7K) + return kModuleArchARMV7K; CHECK(0 && "Invalid subtype of ARM"); return kModuleArchUnknown; case CPU_TYPE_ARM64: + if (cpusubtype == CPU_SUBTYPE_ARM64E) + return kModuleArchARM64E; return kModuleArchARM64; default: CHECK(0 && "Invalid CPU type"); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp index f8d821e125b7a..7eb0c9756d64a 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp @@ -505,6 +505,13 @@ static void ChooseSymbolizerTools(IntrusiveList<SymbolizerTool> *list, } # if SANITIZER_APPLE + if (list->empty()) { + Report( + "WARN: No external symbolizers found. Symbols may be missing or " + "unreliable.\n"); + Report( + "HINT: Is PATH set? 
Does sandbox allow file-read of /usr/bin/atos?\n"); + } VReport(2, "Using dladdr symbolizer.\n"); list->push_back(new (*allocator) DlAddrSymbolizer()); # endif // SANITIZER_APPLE diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp index 00542b944f516..c18e5bd9f3194 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_procmaps_test.cpp @@ -70,7 +70,7 @@ TEST(MemoryMapping, LoadedModuleArchAndUUID) { EXPECT_EQ(arch, kModuleArchI386); } else if (SANITIZER_WORDSIZE == 64) { EXPECT_TRUE(arch == kModuleArchX86_64 || arch == kModuleArchX86_64H || - arch == kModuleArchARM64); + arch == kModuleArchARM64 || arch == kModuleArchARM64E); } const u8 *uuid = modules[i].uuid(); u8 null_uuid[kModuleUUIDSize] = {0}; diff --git a/compiler-rt/lib/scudo/standalone/allocator_config.def b/compiler-rt/lib/scudo/standalone/allocator_config.def index 748530820cd64..0aea7b8f2fb9a 100644 --- a/compiler-rt/lib/scudo/standalone/allocator_config.def +++ b/compiler-rt/lib/scudo/standalone/allocator_config.def @@ -57,6 +57,10 @@ BASE_OPTIONAL(const bool, MaySupportMemoryTagging, false) // Disable the quarantine code. BASE_OPTIONAL(const bool, QuarantineDisabled, false) +// If set to true, malloc_usable_size returns the exact size of the allocation. +// If set to false, return the total available size in the allocation. +BASE_OPTIONAL(const bool, ExactUsableSize, true) + // PRIMARY_REQUIRED_TYPE(NAME) // // SizeClassMap to use with the Primary. diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index 329ec4596482b..ffe9554203241 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -706,19 +706,26 @@ class Allocator { if (!getChunkFromBlock(Block, &Chunk, &Header) && !getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header)) return; - } else { - if (!getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header)) - return; + } else if (!getChunkFromBlock(addHeaderTag(Block), &Chunk, &Header)) { + return; } - if (Header.State == Chunk::State::Allocated) { - uptr TaggedChunk = Chunk; - if (allocatorSupportsMemoryTagging<AllocatorConfig>()) - TaggedChunk = untagPointer(TaggedChunk); - if (useMemoryTagging<AllocatorConfig>(Primary.Options.load())) - TaggedChunk = loadTag(Chunk); - Callback(TaggedChunk, getSize(reinterpret_cast<void *>(Chunk), &Header), - Arg); + + if (Header.State != Chunk::State::Allocated) + return; + + uptr TaggedChunk = Chunk; + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) + TaggedChunk = untagPointer(TaggedChunk); + uptr Size; + if (UNLIKELY(useMemoryTagging<AllocatorConfig>(Primary.Options.load()))) { + TaggedChunk = loadTag(Chunk); + Size = getSize(reinterpret_cast<void *>(Chunk), &Header); + } else if (AllocatorConfig::getExactUsableSize()) { + Size = getSize(reinterpret_cast<void *>(Chunk), &Header); + } else { + Size = getUsableSize(reinterpret_cast<void *>(Chunk), &Header); } + Callback(TaggedChunk, Size, Arg); }; Primary.iterateOverBlocks(Lambda); Secondary.iterateOverBlocks(Lambda); @@ -759,16 +766,50 @@ class Allocator { return false; } - // Return the usable size for a given chunk. Technically we lie, as we just - // report the actual size of a chunk. 
This is done to counteract code actively - // writing past the end of a chunk (like sqlite3) when the usable size allows - // for it, which then forces realloc to copy the usable size of a chunk as - // opposed to its actual size. + ALWAYS_INLINE uptr getUsableSize(const void *Ptr, + Chunk::UnpackedHeader *Header) { + void *BlockBegin = getBlockBegin(Ptr, Header); + if (LIKELY(Header->ClassId)) { + return SizeClassMap::getSizeByClassId(Header->ClassId) - + (reinterpret_cast<uptr>(Ptr) - reinterpret_cast<uptr>(BlockBegin)); + } + + uptr UntaggedPtr = reinterpret_cast<uptr>(Ptr); + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) { + UntaggedPtr = untagPointer(UntaggedPtr); + BlockBegin = untagPointer(BlockBegin); + } + return SecondaryT::getBlockEnd(BlockBegin) - UntaggedPtr; + } + + // Return the usable size for a given chunk. If MTE is enabled or if the + // ExactUsableSize config parameter is true, we report the exact size of + // the original allocation size. Otherwise, we will return the total + // actual usable size. uptr getUsableSize(const void *Ptr) { if (UNLIKELY(!Ptr)) return 0; - return getAllocSize(Ptr); + if (AllocatorConfig::getExactUsableSize() || + UNLIKELY(useMemoryTagging<AllocatorConfig>(Primary.Options.load()))) + return getAllocSize(Ptr); + + initThreadMaybe(); + +#ifdef GWP_ASAN_HOOKS + if (UNLIKELY(GuardedAlloc.pointerIsMine(Ptr))) + return GuardedAlloc.getSize(Ptr); +#endif // GWP_ASAN_HOOKS + + Ptr = getHeaderTaggedPointer(const_cast<void *>(Ptr)); + Chunk::UnpackedHeader Header; + Chunk::loadHeader(Cookie, Ptr, &Header); + + // Getting the alloc size of a chunk only makes sense if it's allocated. + if (UNLIKELY(Header.State != Chunk::State::Allocated)) + reportInvalidChunkState(AllocatorAction::Sizing, Ptr); + + return getUsableSize(Ptr, &Header); } uptr getAllocSize(const void *Ptr) { @@ -951,6 +992,19 @@ class Allocator { MemorySize, 2, 16); } + uptr getBlockBeginTestOnly(const void *Ptr) { + Chunk::UnpackedHeader Header; + Chunk::loadHeader(Cookie, Ptr, &Header); + DCHECK(Header.State == Chunk::State::Allocated); + + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) + Ptr = untagPointer(const_cast<void *>(Ptr)); + void *Begin = getBlockBegin(Ptr, &Header); + if (allocatorSupportsMemoryTagging<AllocatorConfig>()) + Begin = untagPointer(Begin); + return reinterpret_cast<uptr>(Begin); + } + private: typedef typename PrimaryT::SizeClassMap SizeClassMap; diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp index 5fdfd1e7c55cc..4837ac96b9b26 100644 --- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp @@ -1152,6 +1152,248 @@ TEST(ScudoCombinedTest, QuarantineDisabled) { EXPECT_EQ(Stats.find("Stats: Quarantine"), std::string::npos); } +struct UsableSizeClassConfig { + static const scudo::uptr NumBits = 1; + static const scudo::uptr MinSizeLog = 10; + static const scudo::uptr MidSizeLog = 10; + static const scudo::uptr MaxSizeLog = 13; + static const scudo::u16 MaxNumCachedHint = 8; + static const scudo::uptr MaxBytesCachedLog = 12; + static const scudo::uptr SizeDelta = 0; +}; + +struct TestExactUsableSizeConfig { + static const bool MaySupportMemoryTagging = false; + static const bool QuarantineDisabled = true; + + template <class A> using TSDRegistryT = scudo::TSDRegistrySharedT<A, 1U, 1U>; + + struct Primary { + // In order to properly test the usable size, this Primary config has + // four real size classes: 
1024, 2048, 4096, 8192. + using SizeClassMap = scudo::FixedSizeClassMap<UsableSizeClassConfig>; + static const scudo::uptr RegionSizeLog = 21U; + static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX; + typedef scudo::uptr CompactPtrT; + static const scudo::uptr CompactPtrScale = 0; + static const bool EnableRandomOffset = true; + static const scudo::uptr MapSizeIncrement = 1UL << 18; + static const scudo::uptr GroupSizeLog = 18; + }; + template <typename Config> + using PrimaryT = scudo::SizeClassAllocator64<Config>; + + struct Secondary { + template <typename Config> + using CacheT = scudo::MapAllocatorNoCache<Config>; + }; + + template <typename Config> using SecondaryT = scudo::MapAllocator<Config>; +}; + +template <class AllocatorT> void VerifyExactUsableSize(AllocatorT &Allocator) { + // Scan through all sizes up to 10000 then some larger sizes. + for (scudo::uptr Size = 1; Size < 10000; Size++) { + void *P = Allocator.allocate(Size, Origin); + EXPECT_EQ(Size, Allocator.getUsableSize(P)) + << "Failed usable size at allocation size " << Size; + Allocator.deallocate(P, Origin); + } + + // Verify that aligned allocations also return the exact size allocated. + const scudo::uptr AllocSize = 313; + for (scudo::uptr Align = 1; Align <= 8; Align++) { + void *P = Allocator.allocate(AllocSize, Origin, 1U << Align); + EXPECT_EQ(AllocSize, Allocator.getUsableSize(P)) + << "Failed usable size at allocation size " << AllocSize << " at align " + << (1U << Align); + Allocator.deallocate(P, Origin); + } + + // Verify an explicitly large allocation. + const scudo::uptr LargeAllocSize = 1000000; + void *P = Allocator.allocate(LargeAllocSize, Origin); + EXPECT_EQ(LargeAllocSize, Allocator.getUsableSize(P)); + Allocator.deallocate(P, Origin); + + // Now do the same for aligned large allocations. + for (scudo::uptr Align = 1; Align <= 8; Align++) { + void *P = Allocator.allocate(LargeAllocSize, Origin, 1U << Align); + EXPECT_EQ(LargeAllocSize, Allocator.getUsableSize(P)) + << "Failed usable size at allocation size " << LargeAllocSize << " at align " + << (1U << Align); + Allocator.deallocate(P, Origin); + } +} + +template <class AllocatorT> +void VerifyIterateOverUsableSize(AllocatorT &Allocator) { + // This will not verify whether the size is the exact size or the size of the + // size class. Instead, verify that the size matches the usable size and + // assume the other tests have verified getUsableSize.
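+  // As a rough illustration (not asserted here): with ExactUsableSize, the
+  // 128-byte allocation below reports a usable size of 128, while with
+  // ExactUsableSize=false it reports what remains of its 1024-byte size
+  // class after the chunk header.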
+ std::unordered_map<void *, size_t> Pointers; + Pointers.insert({Allocator.allocate(128, Origin), 0U}); + Pointers.insert({Allocator.allocate(128, Origin, 32), 0U}); + Pointers.insert({Allocator.allocate(2000, Origin), 0U}); + Pointers.insert({Allocator.allocate(2000, Origin, 64), 0U}); + Pointers.insert({Allocator.allocate(8000, Origin), 0U}); + Pointers.insert({Allocator.allocate(8000, Origin, 128), 0U}); + Pointers.insert({Allocator.allocate(2000205, Origin), 0U}); + Pointers.insert({Allocator.allocate(2000205, Origin, 128), 0U}); + Pointers.insert({Allocator.allocate(2000205, Origin, 256), 0U}); + + Allocator.disable(); + Allocator.iterateOverChunks( + 0, static_cast<scudo::uptr>(SCUDO_MMAP_RANGE_SIZE - 1), + [](uintptr_t Base, size_t Size, void *Arg) { + std::unordered_map<void *, size_t> *Pointers = + reinterpret_cast<std::unordered_map<void *, size_t> *>(Arg); + (*Pointers)[reinterpret_cast<void *>(Base)] = Size; + }, + reinterpret_cast<void *>(&Pointers)); + Allocator.enable(); + + for (auto [Ptr, IterateSize] : Pointers) { + EXPECT_NE(0U, IterateSize) + << "Pointer " << Ptr << " not found in iterateOverChunks call."; + EXPECT_EQ(IterateSize, Allocator.getUsableSize(Ptr)) + << "Pointer " << Ptr + << " mismatch between iterate size and usable size."; + Allocator.deallocate(Ptr, Origin); + } +} + +TEST(ScudoCombinedTest, ExactUsableSize) { + using AllocatorT = scudo::Allocator<TestExactUsableSizeConfig>; + auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + + VerifyExactUsableSize<AllocatorT>(*Allocator); + VerifyIterateOverUsableSize<AllocatorT>(*Allocator); +} + +struct TestExactUsableSizeMTEConfig : TestExactUsableSizeConfig { + static const bool MaySupportMemoryTagging = true; +}; + +TEST(ScudoCombinedTest, ExactUsableSizeMTE) { + if (!scudo::archSupportsMemoryTagging() || + !scudo::systemDetectsMemoryTagFaultsTestOnly()) + TEST_SKIP("Only supported on systems that can enable MTE."); + + scudo::enableSystemMemoryTaggingTestOnly(); + + using AllocatorT = scudo::Allocator<TestExactUsableSizeMTEConfig>; + auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + + VerifyExactUsableSize<AllocatorT>(*Allocator); + VerifyIterateOverUsableSize<AllocatorT>(*Allocator); +} + +template <class AllocatorT> +void VerifyUsableSizePrimary(AllocatorT &Allocator) { + std::vector<scudo::uptr> SizeClasses = {1024U, 2048U, 4096U, 8192U}; + for (size_t I = 0; I < SizeClasses.size(); I++) { + scudo::uptr SizeClass = SizeClasses[I]; + scudo::uptr StartSize; + if (I == 0) + StartSize = 1; + else + StartSize = SizeClasses[I - 1]; + scudo::uptr UsableSize = SizeClass - scudo::Chunk::getHeaderSize(); + for (scudo::uptr Size = StartSize; Size < UsableSize; Size++) { + void *P = Allocator.allocate(Size, Origin); + EXPECT_EQ(UsableSize, Allocator.getUsableSize(P)) + << "Failed usable size at allocation size " << Size + << " for size class " << SizeClass; + memset(P, 0xff, UsableSize); + EXPECT_EQ(Allocator.getBlockBeginTestOnly(P) + SizeClass, + reinterpret_cast<scudo::uptr>(P) + UsableSize); + Allocator.deallocate(P, Origin); + } + + StartSize = UsableSize + 1; + } + + std::vector<scudo::uptr> Alignments = {32U, 128U}; + for (size_t I = 0; I < SizeClasses.size(); I++) { + scudo::uptr SizeClass = SizeClasses[I]; + scudo::uptr AllocSize; + if (I == 0) + AllocSize = 1; + else + AllocSize = SizeClasses[I - 1] + 1; + + for (auto Alignment : Alignments) { + void *P = Allocator.allocate(AllocSize, Origin, Alignment); + scudo::uptr UsableSize = Allocator.getUsableSize(P); + memset(P, 0xff, 
UsableSize); + EXPECT_EQ(Allocator.getBlockBeginTestOnly(P) + SizeClass, + reinterpret_cast<scudo::uptr>(P) + UsableSize) + << "Failed usable size at allocation size " << AllocSize + << " for size class " << SizeClass << " at alignment " << Alignment; + Allocator.deallocate(P, Origin); + } + } +} + +template <class AllocatorT> +void VerifyUsableSizeSecondary(AllocatorT &Allocator) { + const scudo::uptr LargeAllocSize = 996780; + const scudo::uptr PageSize = scudo::getPageSizeCached(); + void *P = Allocator.allocate(LargeAllocSize, Origin); + scudo::uptr UsableSize = Allocator.getUsableSize(P); + memset(P, 0xff, UsableSize); + // Assumes that the secondary always rounds up allocations to a page boundary. + EXPECT_EQ(scudo::roundUp(reinterpret_cast<scudo::uptr>(P) + LargeAllocSize, + PageSize), + reinterpret_cast<scudo::uptr>(P) + UsableSize); + Allocator.deallocate(P, Origin); + + // Check aligned allocations now. + for (scudo::uptr Alignment = 1; Alignment <= 8; Alignment++) { + void *P = Allocator.allocate(LargeAllocSize, Origin, 1U << Alignment); + scudo::uptr UsableSize = Allocator.getUsableSize(P); + EXPECT_EQ(scudo::roundUp(reinterpret_cast<scudo::uptr>(P) + LargeAllocSize, + PageSize), + reinterpret_cast<scudo::uptr>(P) + UsableSize) + << "Failed usable size at allocation size " << LargeAllocSize + << " at alignment " << Alignment; + Allocator.deallocate(P, Origin); + } +} + +struct TestFullUsableSizeConfig : TestExactUsableSizeConfig { + static const bool ExactUsableSize = false; +}; + +TEST(ScudoCombinedTest, FullUsableSize) { + using AllocatorT = scudo::Allocator<TestFullUsableSizeConfig>; + auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + + VerifyUsableSizePrimary<AllocatorT>(*Allocator); + VerifyUsableSizeSecondary<AllocatorT>(*Allocator); + VerifyIterateOverUsableSize<AllocatorT>(*Allocator); +} + +struct TestFullUsableSizeMTEConfig : TestFullUsableSizeConfig { + static const bool MaySupportMemoryTagging = true; +}; + +TEST(ScudoCombinedTest, FullUsableSizeMTE) { + if (!scudo::archSupportsMemoryTagging() || + !scudo::systemDetectsMemoryTagFaultsTestOnly()) + TEST_SKIP("Only supported on systems that can enable MTE."); + + scudo::enableSystemMemoryTaggingTestOnly(); + + using AllocatorT = scudo::Allocator<TestFullUsableSizeMTEConfig>; + auto Allocator = std::unique_ptr<AllocatorT>(new AllocatorT()); + + // When MTE is enabled, you get exact sizes. + VerifyExactUsableSize<AllocatorT>(*Allocator); + VerifyIterateOverUsableSize<AllocatorT>(*Allocator); +} // Verify that no special quarantine blocks appear in iterateOverChunks. 
TEST(ScudoCombinedTest, QuarantineIterateOverChunks) { using AllocatorT = TestAllocator<TestQuarantineConfig>; diff --git a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp index 612317b3c3293..9e5d0658e5ed5 100644 --- a/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/wrappers_c_test.cpp @@ -588,8 +588,13 @@ TEST_F(ScudoWrappersCTest, MallocInfo) { EXPECT_EQ(errno, 0); fclose(F); EXPECT_EQ(strncmp(Buffer, "<malloc version=\"scudo-", 23), 0); - EXPECT_NE(nullptr, strstr(Buffer, "<alloc size=\"1234\" count=\"")); - EXPECT_NE(nullptr, strstr(Buffer, "<alloc size=\"4321\" count=\"")); + std::string expected; + expected = + "<alloc size=\"" + std::to_string(malloc_usable_size(P1)) + "\" count=\""; + EXPECT_NE(nullptr, strstr(Buffer, expected.c_str())); + expected = + "<alloc size=\"" + std::to_string(malloc_usable_size(P2)) + "\" count=\""; + EXPECT_NE(nullptr, strstr(Buffer, expected.c_str())); free(P1); free(P2); diff --git a/compiler-rt/test/asan/TestCases/strcmp.c b/compiler-rt/test/asan/TestCases/strcmp.c index 417bd491ebe02..2b31e64768c42 100644 --- a/compiler-rt/test/asan/TestCases/strcmp.c +++ b/compiler-rt/test/asan/TestCases/strcmp.c @@ -14,6 +14,8 @@ int main(int argc, char **argv) { assert(strcmp(s1 - 1, s2)); // CHECK: {{.*ERROR: AddressSanitizer: stack-buffer-underflow on address}} - // CHECK: READ of size 1 + // Very rarely `s1[-1]` happens to be '1', resulting in `strcmp` needing to + // check 2 bytes before failing, rather than 1 - this should still pass + // CHECK: READ of size {{[12]}} return 0; } diff --git a/compiler-rt/test/tsan/cxa_guard_acquire.cpp b/compiler-rt/test/tsan/cxa_guard_acquire.cpp index fc407259e8968..6050c243cb8c1 100644 --- a/compiler-rt/test/tsan/cxa_guard_acquire.cpp +++ b/compiler-rt/test/tsan/cxa_guard_acquire.cpp @@ -66,10 +66,17 @@ int main(int argc, char **argv) { printf("Enter main\n"); // If initialization is contended, the blocked thread should enter a - // potentially blocking region. + // potentially blocking region. Note that we use a DAG check because it is + // possible for Thread 1 to acquire the guard, then Thread 2 fail to acquire + // the guard then call `OnPotentiallyBlockingRegionBegin` and print "Enter + // potentially blocking region\n", before Thread 1 manages to reach "Enter + // constructor\n". This is exceptionally rare, but can be replicated by + // inserting a `sleep(1)` between `LazyInit() {` and `printf("Enter + // constructor\n");`. Due to the barrier it is not possible for the exit logs + // to be inverted. 
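+  // In other words, either ordering of the two "Enter" lines below is
+  // acceptable, while the two "Exit" lines must still appear in the exact
+  // order given.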
// - // CHECK-NEXT: Enter constructor - // CHECK-NEXT: Enter potentially blocking region + // CHECK-DAG: Enter constructor + // CHECK-DAG: Enter potentially blocking region // CHECK-NEXT: Exit constructor // CHECK-NEXT: Exit potentially blocking region barrier_init(&barrier, 2); diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp index 225a6558ef956..ef58da61e371b 100644 --- a/flang/examples/FeatureList/FeatureList.cpp +++ b/flang/examples/FeatureList/FeatureList.cpp @@ -445,6 +445,7 @@ struct NodeVisitor { READ_FEATURE(ObjectDecl) READ_FEATURE(OldParameterStmt) READ_FEATURE(OmpAlignedClause) + READ_FEATURE(OmpAllocateDirective) READ_FEATURE(OmpBeginDirective) READ_FEATURE(OmpBeginLoopDirective) READ_FEATURE(OmpBeginSectionsDirective) @@ -541,7 +542,6 @@ struct NodeVisitor { READ_FEATURE(OpenMPCancellationPointConstruct) READ_FEATURE(OpenMPConstruct) READ_FEATURE(OpenMPCriticalConstruct) - READ_FEATURE(OpenMPDeclarativeAllocate) READ_FEATURE(OpenMPDeclarativeConstruct) READ_FEATURE(OpenMPDeclareReductionConstruct) READ_FEATURE(OpenMPDeclareSimdConstruct) @@ -550,7 +550,6 @@ struct NodeVisitor { READ_FEATURE(OmpAtomicDefaultMemOrderClause) READ_FEATURE(OpenMPFlushConstruct) READ_FEATURE(OpenMPLoopConstruct) - READ_FEATURE(OpenMPExecutableAllocate) READ_FEATURE(OpenMPAllocatorsConstruct) READ_FEATURE(OpenMPRequiresConstruct) READ_FEATURE(OpenMPSimpleStandaloneConstruct) diff --git a/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h new file mode 100644 index 0000000000000..d735ce95a83dc --- /dev/null +++ b/flang/include/flang/Optimizer/Builder/CUDAIntrinsicCall.h @@ -0,0 +1,95 @@ +//==-- Builder/CUDAIntrinsicCall.h - lowering of CUDA intrinsics ---*-C++-*-==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_LOWER_CUDAINTRINSICCALL_H +#define FORTRAN_LOWER_CUDAINTRINSICCALL_H + +#include "flang/Optimizer/Builder/IntrinsicCall.h" +#include "mlir/Dialect/LLVMIR/NVVMDialect.h" + +namespace fir { + +struct CUDAIntrinsicLibrary : IntrinsicLibrary { + + // Constructors. + explicit CUDAIntrinsicLibrary(fir::FirOpBuilder &builder, mlir::Location loc) + : IntrinsicLibrary(builder, loc) {} + CUDAIntrinsicLibrary() = delete; + CUDAIntrinsicLibrary(const CUDAIntrinsicLibrary &) = delete; + + // CUDA intrinsic handlers. 
+ mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicAddR2(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + template <int extent> + fir::ExtendedValue genAtomicAddVector(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicCas(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicDec(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicExch(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genAtomicInc(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicMax(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicMin(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicOr(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genAtomicSub(mlir::Type, llvm::ArrayRef<mlir::Value>); + fir::ExtendedValue genAtomicXor(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>); + template <const char *fctName, int extent> + fir::ExtendedValue genLDXXFunc(mlir::Type, + llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genMatchAllSync(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genMatchAnySync(mlir::Type, llvm::ArrayRef<mlir::Value>); + template <typename OpTy> + mlir::Value genNVVMTime(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genSyncThreads(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genSyncThreadsAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genSyncThreadsOr(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genSyncWarp(llvm::ArrayRef<fir::ExtendedValue>); + mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>); + mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>); + void genThreadFence(llvm::ArrayRef<fir::ExtendedValue>); + void genThreadFenceBlock(llvm::ArrayRef<fir::ExtendedValue>); + void genThreadFenceSystem(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreC4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreC8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreI4(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreI8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR2(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkStoreR4(llvm::ArrayRef<fir::ExtendedValue>); + void 
genTMABulkStoreR8(llvm::ArrayRef<fir::ExtendedValue>); + void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>); + template <mlir::NVVM::VoteSyncKind kind> + mlir::Value genVoteSync(mlir::Type, llvm::ArrayRef<mlir::Value>); +}; + +const IntrinsicHandler *findCUDAIntrinsicHandler(llvm::StringRef name); + +} // namespace fir + +#endif // FORTRAN_LOWER_CUDAINTRINSICCALL_H diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index 3407dd01dd504..01d27fd5fc399 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -19,7 +19,6 @@ #include "flang/Runtime/iostat-consts.h" #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/Math/IR/Math.h" #include <optional> @@ -187,20 +186,6 @@ struct IntrinsicLibrary { mlir::Value genAnint(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genAny(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtanpi(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicAdd(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicCas(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genAtomicDec(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicExch(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genAtomicInc(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicMax(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicMin(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicOr(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genAtomicSub(mlir::Type, llvm::ArrayRef<mlir::Value>); - fir::ExtendedValue genAtomicXor(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCommandArgumentCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAsind(mlir::Type, llvm::ArrayRef<mlir::Value>); @@ -208,11 +193,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genAssociated(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genAtand(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierArrive(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierArriveCnt(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genBarrierInit(llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genBarrierTryWait(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genBarrierTryWaitSleep(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genBesselJn(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genBesselYn(mlir::Type, @@ -234,9 +214,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genCount(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genCpuTime(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCshift(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <const char *fctName, int extent> - fir::ExtendedValue genCUDALDXXFunc(mlir::Type, - llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCAssociatedCFunPtr(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genCAssociatedCPtr(mlir::Type, @@ -276,7 +253,6 @@ struct IntrinsicLibrary { llvm::ArrayRef<fir::ExtendedValue>); template <Extremum, ExtremumBehavior> mlir::Value genExtremum(mlir::Type, llvm::ArrayRef<mlir::Value>); - void 
genFenceProxyAsync(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genFloor(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genFraction(mlir::Type resultType, mlir::ArrayRef<mlir::Value> args); @@ -368,8 +344,6 @@ struct IntrinsicLibrary { mlir::Value genMalloc(mlir::Type, llvm::ArrayRef<mlir::Value>); template <typename Shift> mlir::Value genMask(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genMatchAllSync(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genMatchAnySync(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genMatmul(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genMatmulTranspose(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); @@ -392,8 +366,6 @@ struct IntrinsicLibrary { fir::ExtendedValue genNull(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genNumImages(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <typename OpTy> - mlir::Value genNVVMTime(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genPack(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genParity(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genPerror(llvm::ArrayRef<fir::ExtendedValue>); @@ -448,56 +420,25 @@ struct IntrinsicLibrary { fir::ExtendedValue genSum(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); void genSignalSubroutine(llvm::ArrayRef<fir::ExtendedValue>); void genSleep(llvm::ArrayRef<fir::ExtendedValue>); - void genSyncThreads(llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genSyncThreadsAnd(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genSyncThreadsCount(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genSyncThreadsOr(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genSyncWarp(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genSystem(std::optional<mlir::Type>, mlir::ArrayRef<fir::ExtendedValue> args); void genSystemClock(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genTand(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genTanpi(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genTime(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genTMABulkCommitGroup(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadC4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadC8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadI4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadI8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR2(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkLoadR8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreI4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreI8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR2(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreR8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreC4(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkStoreC8(llvm::ArrayRef<fir::ExtendedValue>); - void genTMABulkWaitGroup(llvm::ArrayRef<fir::ExtendedValue>); mlir::Value genTrailz(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue genTransfer(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genTranspose(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genThisGrid(mlir::Type, llvm::ArrayRef<mlir::Value>); fir::ExtendedValue 
genThisImage(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - mlir::Value genThisThreadBlock(mlir::Type, llvm::ArrayRef<mlir::Value>); - mlir::Value genThisWarp(mlir::Type, llvm::ArrayRef<mlir::Value>); - void genThreadFence(llvm::ArrayRef<fir::ExtendedValue>); - void genThreadFenceBlock(llvm::ArrayRef<fir::ExtendedValue>); - void genThreadFenceSystem(llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genTrim(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genUbound(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genUnlink(std::optional<mlir::Type> resultType, llvm::ArrayRef<fir::ExtendedValue> args); fir::ExtendedValue genUnpack(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); fir::ExtendedValue genVerify(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>); - template <mlir::NVVM::VoteSyncKind kind> - mlir::Value genVoteSync(mlir::Type, llvm::ArrayRef<mlir::Value>); /// Implement all conversion functions like DBLE, the first argument is /// the value to convert. There may be an additional KIND arguments that diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index a7398a4ef970f..de2716410d6cd 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -512,6 +512,7 @@ class ParseTreeDumper { NODE(parser, OmpAlignModifier) NODE(parser, OmpAllocateClause) NODE(OmpAllocateClause, Modifier) + NODE(parser, OmpAllocateDirective) NODE(parser, OmpAllocatorComplexModifier) NODE(parser, OmpAllocatorSimpleModifier) NODE(parser, OmpAlwaysModifier) @@ -739,7 +740,6 @@ class ParseTreeDumper { NODE(parser, OpenMPCancellationPointConstruct) NODE(parser, OpenMPConstruct) NODE(parser, OpenMPCriticalConstruct) - NODE(parser, OpenMPDeclarativeAllocate) NODE(parser, OpenMPDeclarativeAssumes) NODE(parser, OpenMPDeclarativeConstruct) NODE(parser, OpenMPDeclareMapperConstruct) @@ -748,7 +748,6 @@ class ParseTreeDumper { NODE(parser, OpenMPDeclareTargetConstruct) NODE(parser, OpenMPDepobjConstruct) NODE(parser, OpenMPDispatchConstruct) - NODE(parser, OpenMPExecutableAllocate) NODE(parser, OpenMPFlushConstruct) NODE(parser, OpenMPGroupprivate) NODE(parser, OpenMPLoopConstruct) diff --git a/flang/include/flang/Parser/openmp-utils.h b/flang/include/flang/Parser/openmp-utils.h index 49db091af93a7..8fa4a84aff06d 100644 --- a/flang/include/flang/Parser/openmp-utils.h +++ b/flang/include/flang/Parser/openmp-utils.h @@ -22,6 +22,7 @@ #include <type_traits> #include <utility> #include <variant> +#include <vector> namespace Fortran::parser::omp { @@ -33,23 +34,6 @@ template <typename T> constexpr auto addr_if(const std::optional<T> &x) { } namespace detail { -using D = llvm::omp::Directive; - -template <typename Construct> // -struct ConstructId { - static constexpr llvm::omp::Directive id{D::OMPD_unknown}; -}; - -#define MAKE_CONSTR_ID(Construct, Id) \ - template <> struct ConstructId<Construct> { \ - static constexpr llvm::omp::Directive id{Id}; \ - } - -MAKE_CONSTR_ID(OpenMPDeclarativeAllocate, D::OMPD_allocate); -MAKE_CONSTR_ID(OpenMPExecutableAllocate, D::OMPD_allocate); - -#undef MAKE_CONSTR_ID - struct DirectiveNameScope { static OmpDirectiveName MakeName(CharBlock source = {}, llvm::omp::Directive id = llvm::omp::Directive::OMPD_unknown) { @@ -97,9 +81,6 @@ struct DirectiveNameScope { } else if constexpr (TupleTrait<T>) { if constexpr (std::is_base_of_v<OmpBlockConstruct, T>) { return std::get<OmpBeginDirective>(x.t).DirName(); - } else if constexpr (std::is_same_v<T, 
OpenMPDeclarativeAllocate> || - std::is_same_v<T, OpenMPExecutableAllocate>) { - return MakeName(std::get<Verbatim>(x.t).source, ConstructId<T>::id); } else { return GetFromTuple( x.t, std::make_index_sequence<std::tuple_size_v<decltype(x.t)>>{}); @@ -139,6 +120,9 @@ template <typename T> OmpDirectiveName GetOmpDirectiveName(const T &x) { return detail::DirectiveNameScope::GetOmpDirectiveName(x); } +const OpenMPDeclarativeConstruct *GetOmp(const DeclarationConstruct &x); +const OpenMPConstruct *GetOmp(const ExecutionPartConstruct &x); + const OmpObjectList *GetOmpObjectList(const OmpClause &clause); template <typename T> @@ -158,6 +142,13 @@ const OmpCombinerExpression *GetCombinerExpr( const OmpReductionSpecifier &rspec); const OmpInitializerExpression *GetInitializerExpr(const OmpClause &init); +struct OmpAllocateInfo { + std::vector<const OmpAllocateDirective *> dirs; + const ExecutionPartConstruct *body{nullptr}; +}; + +OmpAllocateInfo SplitOmpAllocate(const OmpAllocateDirective &x); + } // namespace Fortran::parser::omp #endif // FORTRAN_PARSER_OPENMP_UTILS_H diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index 4dd5e84f60dfe..8c7578f7a1941 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -5151,17 +5151,42 @@ struct OpenMPThreadprivate { CharBlock source; }; -// 2.11.3 allocate -> ALLOCATE (variable-name-list) [clause] -struct OpenMPDeclarativeAllocate { - TUPLE_CLASS_BOILERPLATE(OpenMPDeclarativeAllocate); - CharBlock source; - std::tuple<Verbatim, std::optional<OmpObjectList>, OmpClauseList> t; +// Ref: [4.5:310-312], [5.0:156-158], [5.1:181-184], [5.2:176-177], +// [6.0:310-312] +// +// allocate-directive -> +// ALLOCATE (variable-list-item...) | // since 4.5 +// ALLOCATE (variable-list-item...) // since 5.0, until 5.1 +// ... +// allocate-stmt +// +// The first form is the "declarative-allocate", and is a declarative +// directive. The second is the "executable-allocate" and is an executable +// directive. The executable form was deprecated in 5.2. +// +// The executable-allocate consists of several ALLOCATE directives. Since +// in the parse tree every type corresponding to a directive only corresponds +// to a single directive, the executable form is represented by a sequence +// of nested OmpAllocateDirectives, e.g. +// !$OMP ALLOCATE(x) +// !$OMP ALLOCATE(y) +// ALLOCATE(x, y) +// will become +// OmpAllocateDirective +// |- ALLOCATE(x) // begin directive +// `- OmpAllocateDirective // block +// |- ALLOCATE(y) // begin directive +// `- ALLOCATE(x, y) // block +// +// The block in the declarative-allocate will be empty.
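+// For example (following the same notation), the purely declarative form
+//   !$OMP ALLOCATE(x)
+// on its own is represented as a single node:
+//   OmpAllocateDirective
+//   |- ALLOCATE(x)   // begin directive
+//   `- <empty block>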
+struct OmpAllocateDirective : public OmpBlockConstruct { + INHERITED_TUPLE_CLASS_BOILERPLATE(OmpAllocateDirective, OmpBlockConstruct); }; struct OpenMPDeclarativeConstruct { UNION_CLASS_BOILERPLATE(OpenMPDeclarativeConstruct); CharBlock source; - std::variant<OpenMPDeclarativeAllocate, OpenMPDeclarativeAssumes, + std::variant<OmpAllocateDirective, OpenMPDeclarativeAssumes, OpenMPDeclareMapperConstruct, OpenMPDeclareReductionConstruct, OpenMPDeclareSimdConstruct, OpenMPDeclareTargetConstruct, OmpDeclareVariantDirective, OpenMPGroupprivate, OpenMPThreadprivate, @@ -5174,19 +5199,6 @@ struct OpenMPCriticalConstruct : public OmpBlockConstruct { INHERITED_TUPLE_CLASS_BOILERPLATE(OpenMPCriticalConstruct, OmpBlockConstruct); }; -// 2.11.3 allocate -> ALLOCATE [(variable-name-list)] [clause] -// [ALLOCATE (variable-name-list) [clause] [...]] -// allocate-statement -// clause -> allocator-clause -struct OpenMPExecutableAllocate { - TUPLE_CLASS_BOILERPLATE(OpenMPExecutableAllocate); - CharBlock source; - std::tuple<Verbatim, std::optional<OmpObjectList>, OmpClauseList, - std::optional<std::list<OpenMPDeclarativeAllocate>>, - Statement<AllocateStmt>> - t; -}; - // Ref: [5.2:180-181], [6.0:315] // // allocators-construct -> @@ -5342,9 +5354,9 @@ struct OpenMPConstruct { UNION_CLASS_BOILERPLATE(OpenMPConstruct); std::variant<OpenMPStandaloneConstruct, OpenMPSectionsConstruct, OpenMPSectionConstruct, OpenMPLoopConstruct, OmpBlockConstruct, - OpenMPAtomicConstruct, OpenMPDeclarativeAllocate, OpenMPDispatchConstruct, - OpenMPUtilityConstruct, OpenMPExecutableAllocate, - OpenMPAllocatorsConstruct, OpenMPAssumeConstruct, OpenMPCriticalConstruct> + OpenMPAtomicConstruct, OmpAllocateDirective, OpenMPDispatchConstruct, + OpenMPUtilityConstruct, OpenMPAllocatorsConstruct, OpenMPAssumeConstruct, + OpenMPCriticalConstruct> u; }; diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 71067283d13f7..ad456d89bc432 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -3503,12 +3503,12 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, lower::pft::Evaluation &eval, const parser::OpenMPUtilityConstruct &); -static void -genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - const parser::OpenMPDeclarativeAllocate &declarativeAllocate) { +static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, + semantics::SemanticsContext &semaCtx, + lower::pft::Evaluation &eval, + const parser::OmpAllocateDirective &allocate) { if (!semaCtx.langOptions().OpenMPSimd) - TODO(converter.getCurrentLocation(), "OpenMPDeclarativeAllocate"); + TODO(converter.getCurrentLocation(), "OmpAllocateDirective"); } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, @@ -3899,14 +3899,6 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, TODO(converter.getCurrentLocation(), "OpenMPDispatchConstruct"); } -static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, - semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, - const parser::OpenMPExecutableAllocate &execAllocConstruct) { - if (!semaCtx.langOptions().OpenMPSimd) - TODO(converter.getCurrentLocation(), "OpenMPExecutableAllocate"); -} - static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, diff 
--git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt index 1f95259a857da..37c9c2d703c76 100644 --- a/flang/lib/Optimizer/Builder/CMakeLists.txt +++ b/flang/lib/Optimizer/Builder/CMakeLists.txt @@ -5,6 +5,7 @@ add_flang_library(FIRBuilder BoxValue.cpp Character.cpp Complex.cpp + CUDAIntrinsicCall.cpp CUFCommon.cpp DoLoopHelper.cpp FIRBuilder.cpp diff --git a/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp new file mode 100644 index 0000000000000..6312e61f5e62a --- /dev/null +++ b/flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp @@ -0,0 +1,1587 @@ +//===-- CUDAIntrinsicCall.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Helper routines for constructing the FIR dialect of MLIR for CUDA +// intrinsics. Extensive use of MLIR interfaces and MLIR's coding style +// (https://mlir.llvm.org/getting_started/DeveloperGuide/) is used in this +// module. +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Builder/CUDAIntrinsicCall.h" +#include "flang/Evaluate/common.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/MutableBox.h" +#include "mlir/Dialect/Index/IR/IndexOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" + +namespace fir { + +using CI = CUDAIntrinsicLibrary; + +static const char __ldca_i4x4[] = "__ldca_i4x4_"; +static const char __ldca_i8x2[] = "__ldca_i8x2_"; +static const char __ldca_r2x2[] = "__ldca_r2x2_"; +static const char __ldca_r4x4[] = "__ldca_r4x4_"; +static const char __ldca_r8x2[] = "__ldca_r8x2_"; +static const char __ldcg_i4x4[] = "__ldcg_i4x4_"; +static const char __ldcg_i8x2[] = "__ldcg_i8x2_"; +static const char __ldcg_r2x2[] = "__ldcg_r2x2_"; +static const char __ldcg_r4x4[] = "__ldcg_r4x4_"; +static const char __ldcg_r8x2[] = "__ldcg_r8x2_"; +static const char __ldcs_i4x4[] = "__ldcs_i4x4_"; +static const char __ldcs_i8x2[] = "__ldcs_i8x2_"; +static const char __ldcs_r2x2[] = "__ldcs_r2x2_"; +static const char __ldcs_r4x4[] = "__ldcs_r4x4_"; +static const char __ldcs_r8x2[] = "__ldcs_r8x2_"; +static const char __ldcv_i4x4[] = "__ldcv_i4x4_"; +static const char __ldcv_i8x2[] = "__ldcv_i8x2_"; +static const char __ldcv_r2x2[] = "__ldcv_r2x2_"; +static const char __ldcv_r4x4[] = "__ldcv_r4x4_"; +static const char __ldcv_r8x2[] = "__ldcv_r8x2_"; +static const char __ldlu_i4x4[] = "__ldlu_i4x4_"; +static const char __ldlu_i8x2[] = "__ldlu_i8x2_"; +static const char __ldlu_r2x2[] = "__ldlu_r2x2_"; +static const char __ldlu_r4x4[] = "__ldlu_r4x4_"; +static const char __ldlu_r8x2[] = "__ldlu_r8x2_"; + +// CUDA specific intrinsic handlers. 
+static constexpr IntrinsicHandler cudaHandlers[]{ + {"__ldca_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldca_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldca_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcg_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcg_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcs_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcs_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldcv_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldcv_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_i4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_i4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_i8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_i8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r2x2", + 
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r2x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r4x4, 4>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"__ldlu_r8x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genLDXXFunc<__ldlu_r8x2, 2>), + {{{"a", asAddr}}}, + /*isElemental=*/false}, + {"all_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::all>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"any_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::any>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"atomicadd_r4x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicadd_r4x4", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<4>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAdd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicaddr2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicAddR2), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddvector_r2x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicaddvector_r4x2", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>( + &CI::genAtomicAddVector<2>), + {{{"a", asAddr}, {"v", asAddr}}}, + false}, + {"atomicandi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicAnd), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomiccasd", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasf", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasi", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomiccasul", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicCas), + {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, + false}, + {"atomicdeci", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicDec), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchd", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchf", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchi", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicexchul", + 
static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicExch), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicinci", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicInc), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmaxl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMax), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmind", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicminf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicmini", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicminl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicMin), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicori", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicOr), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubi", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicsubl", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genAtomicSub), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"atomicxori", + static_cast<CUDAIntrinsicLibrary::ExtendedGenerator>(&CI::genAtomicXor), + {{{"a", asAddr}, {"v", asValue}}}, + false}, + {"ballot_sync", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>), + {{{"mask", asValue}, {"pred", asValue}}}, + /*isElemental=*/false}, + {"barrier_arrive", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierArrive), + {{{"barrier", asAddr}}}, + /*isElemental=*/false}, + {"barrier_arrive_cnt", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierArriveCnt), + {{{"barrier", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"barrier_init", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genBarrierInit), + {{{"barrier", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"barrier_try_wait", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierTryWait), + {{{"barrier", asAddr}, {"token", asValue}}}, + /*isElemental=*/false}, + {"barrier_try_wait_sleep", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genBarrierTryWaitSleep), + {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}}, + /*isElemental=*/false}, + {"clock", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genNVVMTime<mlir::NVVM::ClockOp>), + {}, + /*isElemental=*/false}, + {"clock64", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + 
&CI::genNVVMTime<mlir::NVVM::Clock64Op>), + {}, + /*isElemental=*/false}, + {"fence_proxy_async", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genFenceProxyAsync), + {}, + /*isElemental=*/false}, + {"globaltimer", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genNVVMTime<mlir::NVVM::GlobalTimerOp>), + {}, + /*isElemental=*/false}, + {"match_all_syncjd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjj", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_all_syncjx", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAllSync), + {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, + /*isElemental=*/false}, + {"match_any_syncjd", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjf", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjj", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"match_any_syncjx", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genMatchAnySync), + {{{"mask", asValue}, {"value", asValue}}}, + /*isElemental=*/false}, + {"syncthreads", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genSyncThreads), + {}, + /*isElemental=*/false}, + {"syncthreads_and_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsAnd), + {}, + /*isElemental=*/false}, + {"syncthreads_and_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsAnd), + {}, + /*isElemental=*/false}, + {"syncthreads_count_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsCount), + {}, + /*isElemental=*/false}, + {"syncthreads_count_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsCount), + {}, + /*isElemental=*/false}, + {"syncthreads_or_i4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsOr), + {}, + /*isElemental=*/false}, + {"syncthreads_or_l4", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genSyncThreadsOr), + {}, + /*isElemental=*/false}, + {"syncwarp", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genSyncWarp), + {}, + /*isElemental=*/false}, + {"this_grid", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisGrid), + {}, + /*isElemental=*/false}, + {"this_thread_block", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>( + &CI::genThisThreadBlock), + {}, + /*isElemental=*/false}, + {"this_warp", + static_cast<CUDAIntrinsicLibrary::ElementalGenerator>(&CI::genThisWarp), + {}, + /*isElemental=*/false}, + {"threadfence", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFence), + {}, + /*isElemental=*/false}, + {"threadfence_block", + 
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFenceBlock), + {}, + /*isElemental=*/false}, + {"threadfence_system", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genThreadFenceSystem), + {}, + /*isElemental=*/false}, + {"tma_bulk_commit_group", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkCommitGroup), + {{}}, + /*isElemental=*/false}, + {"tma_bulk_g2s", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkG2S), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nbytes", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldc4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadC4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldc8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadC8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldi4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadI4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldi8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadI8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr2", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR2), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR4), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_ldr8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkLoadR8), + {{{"barrier", asAddr}, + {"src", asAddr}, + {"dst", asAddr}, + {"nelems", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_s2g", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>(&CI::genTMABulkS2G), + {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_c4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreC4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_c8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreC8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_i4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreI4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_i8", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreI8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r2", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR2), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r4", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR4), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_store_r8", + 
static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkStoreR8), + {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, + /*isElemental=*/false}, + {"tma_bulk_wait_group", + static_cast<CUDAIntrinsicLibrary::SubroutineGenerator>( + &CI::genTMABulkWaitGroup), + {{}}, + /*isElemental=*/false}, +}; + +template <std::size_t N> +static constexpr bool isSorted(const IntrinsicHandler (&array)[N]) { + // Replace by std::sorted when C++20 is default (will be constexpr). + const IntrinsicHandler *lastSeen{nullptr}; + bool isSorted{true}; + for (const auto &x : array) { + if (lastSeen) + isSorted &= std::string_view{lastSeen->name} < std::string_view{x.name}; + lastSeen = &x; + } + return isSorted; +} +static_assert(isSorted(cudaHandlers) && "map must be sorted"); + +const IntrinsicHandler *findCUDAIntrinsicHandler(llvm::StringRef name) { + auto compare = [](const IntrinsicHandler &cudaHandler, llvm::StringRef name) { + return name.compare(cudaHandler.name) > 0; + }; + auto result = llvm::lower_bound(cudaHandlers, name, compare); + return result != std::end(cudaHandlers) && result->name == name ? result + : nullptr; +} + +static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder, + mlir::Location loc, + mlir::Value barrier, + mlir::NVVM::NVVMMemorySpace space) { + mlir::Value llvmPtr = fir::ConvertOp::create( + builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()), + barrier); + mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create( + builder, loc, + mlir::LLVM::LLVMPointerType::get(builder.getContext(), + static_cast<unsigned>(space)), + llvmPtr); + return addrCast; +} + +static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, + mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0, + mlir::Value arg1) { + auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + arg0 = builder.createConvert(loc, llvmPointerType, arg0); + return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1, + mlir::LLVM::AtomicOrdering::seq_cst); +} + +// ATOMICADD +mlir::Value +CUDAIntrinsicLibrary::genAtomicAdd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? 
mlir::LLVM::AtomicBinOp::add + : mlir::LLVM::AtomicBinOp::fadd; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicAddR2(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + + mlir::Value a = fir::getBase(args[0]); + + if (mlir::isa<fir::BaseBoxType>(a.getType())) { + a = fir::BoxAddrOp::create(builder, loc, a); + } + + auto loc = builder.getUnknownLoc(); + auto f16Ty = builder.getF16Type(); + auto i32Ty = builder.getI32Type(); + auto vecF16Ty = mlir::VectorType::get({2}, f16Ty); + mlir::Type idxTy = builder.getIndexType(); + auto f16RefTy = fir::ReferenceType::get(f16Ty); + auto zero = builder.createIntegerConstant(loc, idxTy, 0); + auto one = builder.createIntegerConstant(loc, idxTy, 1); + auto v1Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, + fir::getBase(args[1]), zero); + auto v2Coord = fir::CoordinateOp::create(builder, loc, f16RefTy, + fir::getBase(args[1]), one); + auto v1 = fir::LoadOp::create(builder, loc, v1Coord); + auto v2 = fir::LoadOp::create(builder, loc, v2Coord); + mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecF16Ty); + mlir::Value vec1 = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, v1, builder.createIntegerConstant(loc, i32Ty, 0)); + mlir::Value vec2 = mlir::LLVM::InsertElementOp::create( + builder, loc, vec1, v2, builder.createIntegerConstant(loc, i32Ty, 1)); + auto res = genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, vec2); + auto i32VecTy = mlir::VectorType::get({1}, i32Ty); + mlir::Value vecI32 = + mlir::vector::BitCastOp::create(builder, loc, i32VecTy, res); + return mlir::vector::ExtractOp::create(builder, loc, vecI32, + mlir::ArrayRef<int64_t>{0}); +} + +// ATOMICADDVECTOR +template <int extent> +fir::ExtendedValue CUDAIntrinsicLibrary::genAtomicAddVector( + mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value res = fir::AllocaOp::create( + builder, loc, fir::SequenceType::get({extent}, resultType)); + mlir::Value a = fir::getBase(args[0]); + if (mlir::isa<fir::BaseBoxType>(a.getType())) { + a = fir::BoxAddrOp::create(builder, loc, a); + } + auto vecTy = mlir::VectorType::get({extent}, resultType); + auto refTy = fir::ReferenceType::get(resultType); + mlir::Type i32Ty = builder.getI32Type(); + mlir::Type idxTy = builder.getIndexType(); + + // Extract the values from the array. + llvm::SmallVector<mlir::Value> values; + for (unsigned i = 0; i < extent; ++i) { + mlir::Value pos = builder.createIntegerConstant(loc, idxTy, i); + mlir::Value coord = fir::CoordinateOp::create(builder, loc, refTy, + fir::getBase(args[1]), pos); + mlir::Value value = fir::LoadOp::create(builder, loc, coord); + values.push_back(value); + } + // Pack extracted values into a vector to call the atomic add. + mlir::Value undef = mlir::LLVM::UndefOp::create(builder, loc, vecTy); + for (unsigned i = 0; i < extent; ++i) { + mlir::Value insert = mlir::LLVM::InsertElementOp::create( + builder, loc, undef, values[i], + builder.createIntegerConstant(loc, i32Ty, i)); + undef = insert; + } + // Atomic operation with a vector of values. + mlir::Value add = + genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::fadd, a, undef); + // Store results in the result array. 
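+ // The atomicrmw result carries the values read from memory before the + // update; copy each lane back into the stack temporary returned below.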
+ for (unsigned i = 0; i < extent; ++i) { + mlir::Value r = mlir::LLVM::ExtractElementOp::create( + builder, loc, add, builder.createIntegerConstant(loc, i32Ty, i)); + mlir::Value c = fir::CoordinateOp::create( + builder, loc, refTy, res, builder.createIntegerConstant(loc, idxTy, i)); + fir::StoreOp::create(builder, loc, r, c); + } + mlir::Value ext = builder.createIntegerConstant(loc, idxTy, extent); + return fir::ArrayBoxValue(res, {ext}); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicAnd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicOr(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICCAS +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicCas(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + auto successOrdering = mlir::LLVM::AtomicOrdering::acq_rel; + auto failureOrdering = mlir::LLVM::AtomicOrdering::monotonic; + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(resultType.getContext()); + + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + mlir::Value arg2 = fir::getBase(args[2]); + + auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value { + if (mlir::isa<mlir::Float32Type>(arg.getType())) + return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(), + arg); + if (mlir::isa<mlir::Float64Type>(arg.getType())) + return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(), + arg); + return arg; + }; + + arg1 = bitCastFloat(arg1); + arg2 = bitCastFloat(arg2); + + if (arg1.getType() != arg2.getType()) { + // arg1 and arg2 need to have the same type in AtomicCmpXchgOp. 
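+ // Convert the replacement value to the compare value's type so both + // cmpxchg operands agree.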
+ arg2 = builder.createConvert(loc, arg1.getType(), arg2); + } + + auto address = + mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0) + .getResult(0); + auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create( + builder, loc, address, arg1, arg2, successOrdering, failureOrdering); + mlir::Value boolResult = + mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1); + return builder.createConvert(loc, resultType, boolResult); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicDec(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICEXCH +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicExch(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + assert(arg1.getType().isIntOrFloat()); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::xchg; + return genAtomBinOp(builder, loc, binOp, arg0, arg1); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicInc(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + assert(mlir::isa<mlir::IntegerType>(args[1].getType())); + + mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicMax(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? mlir::LLVM::AtomicBinOp::max + : mlir::LLVM::AtomicBinOp::fmax; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +mlir::Value +CUDAIntrinsicLibrary::genAtomicMin(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? mlir::LLVM::AtomicBinOp::min + : mlir::LLVM::AtomicBinOp::fmin; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICSUB +mlir::Value +CUDAIntrinsicLibrary::genAtomicSub(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::LLVM::AtomicBinOp binOp = + mlir::isa<mlir::IntegerType>(args[1].getType()) + ? 
mlir::LLVM::AtomicBinOp::sub + : mlir::LLVM::AtomicBinOp::fsub; + return genAtomBinOp(builder, loc, binOp, args[0], args[1]); +} + +// ATOMICXOR +fir::ExtendedValue +CUDAIntrinsicLibrary::genAtomicXor(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value arg0 = fir::getBase(args[0]); + mlir::Value arg1 = fir::getBase(args[1]); + return genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::_xor, arg0, arg1); +} + +// BARRIER_ARRIVE +mlir::Value +CUDAIntrinsicLibrary::genBarrierArrive(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 1); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); + return mlir::NVVM::MBarrierArriveOp::create(builder, loc, resultType, barrier) + .getResult(); +} + +// BARRIER_ARRIVE_CNT +mlir::Value +CUDAIntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); + return mlir::NVVM::InlinePtxOp::create(builder, loc, {resultType}, + {barrier, args[1]}, {}, + "mbarrier.arrive.expect_tx.release." + "cta.shared::cta.b64 %0, [%1], %2;", + {}) + .getResult(0); +} + +// BARRIER_INIT +void CUDAIntrinsicLibrary::genBarrierInit( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 2); + mlir::Value barrier = convertPtrToNVVMSpace( + builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); + mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier, + fir::getBase(args[1]), {}); + auto kind = mlir::NVVM::ProxyKindAttr::get( + builder.getContext(), mlir::NVVM::ProxyKind::async_shared); + auto space = mlir::NVVM::SharedSpaceAttr::get( + builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); + mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); +} + +// BARRIER_TRY_WAIT +mlir::Value +CUDAIntrinsicLibrary::genBarrierTryWait(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); + fir::StoreOp::create(builder, loc, zero, res); + mlir::Value ns = + builder.createIntegerConstant(loc, builder.getI32Type(), 1000000); + mlir::Value load = fir::LoadOp::create(builder, loc, res); + auto whileOp = mlir::scf::WhileOp::create( + builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load}); + mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore()); + mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc); + builder.setInsertionPointToStart(beforeBlock); + mlir::Value condition = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::ne, beforeArg, zero); + mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg); + mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter()); + afterBlock->addArgument(resultType, loc); + builder.setInsertionPointToStart(afterBlock); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); + mlir::Value ret = mlir::NVVM::InlinePtxOp::create( + builder, loc, {resultType}, {barrier, args[1], ns}, {}, + "{\n" + " .reg .pred p;\n" + " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" + " selp.b32 %0, 1, 0, p;\n" + "}", + {}) + .getResult(0); + mlir::scf::YieldOp::create(builder, loc, 
ret); + builder.setInsertionPointAfter(whileOp); + return whileOp.getResult(0); +} + +// BARRIER_TRY_WAIT_SLEEP +mlir::Value +CUDAIntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 3); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); + return mlir::NVVM::InlinePtxOp::create( + builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, + "{\n" + " .reg .pred p;\n" + " mbarrier.try_wait.shared.b64 p, [%1], %2, %3;\n" + " selp.b32 %0, 1, 0, p;\n" + "}", + {}) + .getResult(0); +} + +// FENCE_PROXY_ASYNC +void CUDAIntrinsicLibrary::genFenceProxyAsync( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + auto kind = mlir::NVVM::ProxyKindAttr::get( + builder.getContext(), mlir::NVVM::ProxyKind::async_shared); + auto space = mlir::NVVM::SharedSpaceAttr::get( + builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); + mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); +} + +// __LDCA, __LDCS, __LDLU, __LDCV +template <const char *fctName, int extent> +fir::ExtendedValue +CUDAIntrinsicLibrary::genLDXXFunc(mlir::Type resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 1); + mlir::Type resTy = fir::SequenceType::get(extent, resultType); + mlir::Value arg = fir::getBase(args[0]); + mlir::Value res = fir::AllocaOp::create(builder, loc, resTy); + if (mlir::isa<fir::BaseBoxType>(arg.getType())) + arg = fir::BoxAddrOp::create(builder, loc, arg); + mlir::Type refResTy = fir::ReferenceType::get(resTy); + mlir::FunctionType ftype = + mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {}); + auto funcOp = builder.createFunction(loc, fctName, ftype); + llvm::SmallVector<mlir::Value> funcArgs; + funcArgs.push_back(res); + funcArgs.push_back(arg); + fir::CallOp::create(builder, loc, funcOp, funcArgs); + mlir::Value ext = + builder.createIntegerConstant(loc, builder.getIndexType(), extent); + return fir::ArrayBoxValue(res, {ext}); +} + +// CLOCK, CLOCK64, GLOBALTIMER +template <typename OpTy> +mlir::Value +CUDAIntrinsicLibrary::genNVVMTime(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0 && "expect no arguments"); + return OpTy::create(builder, loc, resultType).getResult(); +} + +// MATCH_ALL_SYNC +mlir::Value +CUDAIntrinsicLibrary::genMatchAllSync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 3); + bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); + + mlir::Type i1Ty = builder.getI1Type(); + mlir::MLIRContext *context = builder.getContext(); + + mlir::Value arg1 = args[1]; + if (arg1.getType().isF32() || arg1.getType().isF64()) + arg1 = fir::ConvertOp::create( + builder, loc, is32 ? 
builder.getI32Type() : builder.getI64Type(), arg1); + + mlir::Type retTy = + mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty}); + auto match = + mlir::NVVM::MatchSyncOp::create(builder, loc, retTy, args[0], arg1, + mlir::NVVM::MatchSyncKind::all) + .getResult(); + auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0); + auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1); + auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred); + fir::StoreOp::create(builder, loc, conv, args[2]); + return value; +} + +// MATCH_ANY_SYNC +mlir::Value +CUDAIntrinsicLibrary::genMatchAnySync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); + + mlir::Value arg1 = args[1]; + if (arg1.getType().isF32() || arg1.getType().isF64()) + arg1 = fir::ConvertOp::create( + builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); + + return mlir::NVVM::MatchSyncOp::create(builder, loc, resultType, args[0], + arg1, mlir::NVVM::MatchSyncKind::any) + .getResult(); +} + +// SYNCTHREADS +void CUDAIntrinsicLibrary::genSyncThreads( + llvm::ArrayRef<fir::ExtendedValue> args) { + mlir::NVVM::Barrier0Op::create(builder, loc); +} + +// SYNCTHREADS_AND +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.and"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCTHREADS_COUNT +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.popc"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCTHREADS_OR +mlir::Value +CUDAIntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.or"; + mlir::MLIRContext *context = builder.getContext(); + mlir::Type i32 = builder.getI32Type(); + mlir::FunctionType ftype = + mlir::FunctionType::get(context, {resultType}, {i32}); + auto funcOp = builder.createFunction(loc, funcName, ftype); + mlir::Value arg = builder.createConvert(loc, i32, args[0]); + return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); +} + +// SYNCWARP +void CUDAIntrinsicLibrary::genSyncWarp( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 1); + constexpr llvm::StringLiteral funcName = "llvm.nvvm.bar.warp.sync"; + mlir::Value mask = fir::getBase(args[0]); + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {mask.getType()}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector<mlir::Value> argsList{mask}; + fir::CallOp::create(builder, 
loc, funcOp, argsList); +} + +// THIS_GRID +mlir::Value +CUDAIntrinsicLibrary::genThisGrid(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); + mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); + + mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty); + mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty); + mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty); + + mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); + mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); + mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); + mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty); + mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty); + mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty); + + // this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) * + // (blockDim.x * gridDim.x); + mlir::Value resZ = + mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ); + mlir::Value resY = + mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY); + mlir::Value resX = + mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX); + mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, resY); + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX); + + // tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) + + // blockIdx.x; + // this_grid.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) + + // ((threadIdx.z * blockDim.y) * blockDim.x) + + // (threadIdx.y * blockDim.x) + threadIdx.x + 1; + mlir::Value r1 = + mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY); + mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX); + mlir::Value r3 = + mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX); + mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); + mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX); + + mlir::Value bXbY = + mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY); + mlir::Value bXbYbZ = + mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ); + mlir::Value tZbY = + mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); + mlir::Value tZbYbX = + mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX); + mlir::Value tYbX = + mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); + mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ); + rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX); + rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX); + rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + rank = mlir::arith::AddIOp::create(builder, loc, rank, one); + + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = 
fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res; +} + +// THIS_THREAD_BLOCK +mlir::Value +CUDAIntrinsicLibrary::genThisThreadBlock(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + // this_thread_block%size = blockDim.z * blockDim.y * blockDim.x; + mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); + mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); + mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); + mlir::Value size = + mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY); + size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX); + + // this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) + + // (threadIdx.y * blockDim.x) + threadIdx.x + 1; + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); + mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); + mlir::Value r1 = + mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); + mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX); + mlir::Value r3 = + mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); + mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); + mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + rank = mlir::arith::AddIOp::create(builder, loc, rank, one); + + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res; 
+} + +// THIS_WARP +mlir::Value +CUDAIntrinsicLibrary::genThisWarp(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 0); + auto recTy = mlir::cast<fir::RecordType>(resultType); + assert(recTy && "RecordType expected"); + mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); + mlir::Type i32Ty = builder.getI32Type(); + + // coalesced_group%size = 32 + mlir::Value size = builder.createIntegerConstant(loc, i32Ty, 32); + auto sizeFieldName = recTy.getTypeList()[1].first; + mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; + mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); + mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, sizeFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value sizeCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); + fir::StoreOp::create(builder, loc, size, sizeCoord); + + // coalesced_group%rank = threadIdx.x & 31 + 1 + mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); + mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31); + mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); + mlir::Value masked = + mlir::arith::AndIOp::create(builder, loc, threadIdX, mask); + mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one); + auto rankFieldName = recTy.getTypeList()[2].first; + mlir::Type rankFieldTy = recTy.getTypeList()[2].second; + mlir::Value rankFieldIndex = fir::FieldIndexOp::create( + builder, loc, fieldIndexType, rankFieldName, recTy, + /*typeParams=*/mlir::ValueRange{}); + mlir::Value rankCoord = fir::CoordinateOp::create( + builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); + fir::StoreOp::create(builder, loc, rank, rankCoord); + return res; +} + +// THREADFENCE +void CUDAIntrinsicLibrary::genThreadFence( + llvm::ArrayRef<fir::ExtendedValue> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl"; + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector<mlir::Value> noArgs; + fir::CallOp::create(builder, loc, funcOp, noArgs); +} + +// THREADFENCE_BLOCK +void CUDAIntrinsicLibrary::genThreadFenceBlock( + llvm::ArrayRef<fir::ExtendedValue> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta"; + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector<mlir::Value> noArgs; + fir::CallOp::create(builder, loc, funcOp, noArgs); +} + +// THREADFENCE_SYSTEM +void CUDAIntrinsicLibrary::genThreadFenceSystem( + llvm::ArrayRef<fir::ExtendedValue> args) { + constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys"; + mlir::FunctionType funcType = + mlir::FunctionType::get(builder.getContext(), {}, {}); + auto funcOp = builder.createFunction(loc, funcName, funcType); + llvm::SmallVector<mlir::Value> noArgs; + fir::CallOp::create(builder, loc, funcOp, noArgs); +} + +// TMA_BULK_COMMIT_GROUP +void CUDAIntrinsicLibrary::genTMABulkCommitGroup( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc); +} + +// TMA_BULK_G2S +void CUDAIntrinsicLibrary::genTMABulkG2S( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value barrier = 
convertPtrToNVVMSpace( + builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); + mlir::Value dst = + convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]), + mlir::NVVM::NVVMMemorySpace::SharedCluster); + mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create( + builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); +} + +static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value barrier, mlir::Value src, + mlir::Value dst, mlir::Value nelem, + mlir::Value eleSize) { + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize); + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); + barrier = builder.createConvert(loc, llvmPtrTy, barrier); + dst = builder.createConvert(loc, llvmPtrTy, dst); + src = builder.createConvert(loc, llvmPtrTy, src); + mlir::NVVM::InlinePtxOp::create( + builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {}, + "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], " + "[%1], %2, [%3];", + {}); + mlir::NVVM::InlinePtxOp::create( + builder, loc, mlir::TypeRange{}, {barrier, size}, {}, + "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {}); +} + +// TMA_BULK_LOADC4 +void CUDAIntrinsicLibrary::genTMABulkLoadC4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADC8 +void CUDAIntrinsicLibrary::genTMABulkLoadC8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 16); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADI4 +void CUDAIntrinsicLibrary::genTMABulkLoadI4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADI8 +void CUDAIntrinsicLibrary::genTMABulkLoadI8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR2 +void CUDAIntrinsicLibrary::genTMABulkLoadR2( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 2); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_LOADR4 +void CUDAIntrinsicLibrary::genTMABulkLoadR4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// 
TMA_BULK_LOADR8 +void CUDAIntrinsicLibrary::genTMABulkLoadR8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 4); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), fir::getBase(args[3]), eleSize); +} + +// TMA_BULK_S2G +void CUDAIntrinsicLibrary::genTMABulkS2G( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]), + mlir::NVVM::NVVMMemorySpace::Shared); + mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create( + builder, loc, dst, src, fir::getBase(args[2]), {}, {}); + + mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, + "cp.async.bulk.commit_group;", {}); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, + builder.getI32IntegerAttr(0), {}); +} + +static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value src, mlir::Value dst, mlir::Value count, + mlir::Value eleSize) { + mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count); + src = convertPtrToNVVMSpace(builder, loc, src, + mlir::NVVM::NVVMMemorySpace::Shared); + dst = convertPtrToNVVMSpace(builder, loc, dst, + mlir::NVVM::NVVMMemorySpace::Global); + mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src, + size, {}, {}); + mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, + "cp.async.bulk.commit_group;", {}); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, + builder.getI32IntegerAttr(0), {}); +} + +// TMA_BULK_STORE_C4 +void CUDAIntrinsicLibrary::genTMABulkStoreC4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_C8 +void CUDAIntrinsicLibrary::genTMABulkStoreC8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 16); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_I4 +void CUDAIntrinsicLibrary::genTMABulkStoreI4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_I8 +void CUDAIntrinsicLibrary::genTMABulkStoreI8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R2 +void CUDAIntrinsicLibrary::genTMABulkStoreR2( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 2); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R4 +void 
CUDAIntrinsicLibrary::genTMABulkStoreR4( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 4); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_STORE_R8 +void CUDAIntrinsicLibrary::genTMABulkStoreR8( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 3); + mlir::Value eleSize = + builder.createIntegerConstant(loc, builder.getI32Type(), 8); + genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), + fir::getBase(args[2]), eleSize); +} + +// TMA_BULK_WAIT_GROUP +void CUDAIntrinsicLibrary::genTMABulkWaitGroup( + llvm::ArrayRef<fir::ExtendedValue> args) { + assert(args.size() == 0); + auto group = builder.getIntegerAttr(builder.getI32Type(), 0); + mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {}); +} + +// ALL_SYNC, ANY_SYNC, BALLOT_SYNC +template <mlir::NVVM::VoteSyncKind kind> +mlir::Value +CUDAIntrinsicLibrary::genVoteSync(mlir::Type resultType, + llvm::ArrayRef<mlir::Value> args) { + assert(args.size() == 2); + mlir::Value arg1 = + fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]); + mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot + ? builder.getI32Type() + : builder.getI1Type(); + auto voteRes = + mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind) + .getResult(); + return fir::ConvertOp::create(builder, loc, resultType, voteRes); +} + +} // namespace fir diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 15ea84565dd75..3eb60448fae38 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -16,6 +16,7 @@ #include "flang/Optimizer/Builder/IntrinsicCall.h" #include "flang/Common/static-multimap-view.h" #include "flang/Optimizer/Builder/BoxValue.h" +#include "flang/Optimizer/Builder/CUDAIntrinsicCall.h" #include "flang/Optimizer/Builder/CUFCommon.h" #include "flang/Optimizer/Builder/Character.h" #include "flang/Optimizer/Builder/Complex.h" @@ -50,7 +51,6 @@ #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include "mlir/Dialect/Math/IR/Math.h" -#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Vector/IR/VectorOps.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -108,34 +108,6 @@ using I = IntrinsicLibrary; /// argument is an optional variable in the current scope). static constexpr bool handleDynamicOptional = true; -/// TODO: Move all CUDA Fortran intrinsic handlers into its own file similar to -/// PPC. 
-static const char __ldca_i4x4[] = "__ldca_i4x4_"; -static const char __ldca_i8x2[] = "__ldca_i8x2_"; -static const char __ldca_r2x2[] = "__ldca_r2x2_"; -static const char __ldca_r4x4[] = "__ldca_r4x4_"; -static const char __ldca_r8x2[] = "__ldca_r8x2_"; -static const char __ldcg_i4x4[] = "__ldcg_i4x4_"; -static const char __ldcg_i8x2[] = "__ldcg_i8x2_"; -static const char __ldcg_r2x2[] = "__ldcg_r2x2_"; -static const char __ldcg_r4x4[] = "__ldcg_r4x4_"; -static const char __ldcg_r8x2[] = "__ldcg_r8x2_"; -static const char __ldcs_i4x4[] = "__ldcs_i4x4_"; -static const char __ldcs_i8x2[] = "__ldcs_i8x2_"; -static const char __ldcs_r2x2[] = "__ldcs_r2x2_"; -static const char __ldcs_r4x4[] = "__ldcs_r4x4_"; -static const char __ldcs_r8x2[] = "__ldcs_r8x2_"; -static const char __ldcv_i4x4[] = "__ldcv_i4x4_"; -static const char __ldcv_i8x2[] = "__ldcv_i8x2_"; -static const char __ldcv_r2x2[] = "__ldcv_r2x2_"; -static const char __ldcv_r4x4[] = "__ldcv_r4x4_"; -static const char __ldcv_r8x2[] = "__ldcv_r8x2_"; -static const char __ldlu_i4x4[] = "__ldlu_i4x4_"; -static const char __ldlu_i8x2[] = "__ldlu_i8x2_"; -static const char __ldlu_r2x2[] = "__ldlu_r2x2_"; -static const char __ldlu_r4x4[] = "__ldlu_r4x4_"; -static const char __ldlu_r8x2[] = "__ldlu_r8x2_"; - /// Table that drives the fir generation depending on the intrinsic or intrinsic /// module procedure one to one mapping with Fortran arguments. If no mapping is /// defined here for a generic intrinsic, genRuntimeCall will be called @@ -144,106 +116,6 @@ static const char __ldlu_r8x2[] = "__ldlu_r8x2_"; /// argument must not be lowered by value. In which case, the lowering rules /// should be provided for all the intrinsic arguments for completeness. static constexpr IntrinsicHandler handlers[]{ - {"__ldca_i4x4", - &I::genCUDALDXXFunc<__ldca_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_i8x2", - &I::genCUDALDXXFunc<__ldca_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r2x2", - &I::genCUDALDXXFunc<__ldca_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r4x4", - &I::genCUDALDXXFunc<__ldca_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldca_r8x2", - &I::genCUDALDXXFunc<__ldca_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_i4x4", - &I::genCUDALDXXFunc<__ldcg_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_i8x2", - &I::genCUDALDXXFunc<__ldcg_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r2x2", - &I::genCUDALDXXFunc<__ldcg_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r4x4", - &I::genCUDALDXXFunc<__ldcg_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcg_r8x2", - &I::genCUDALDXXFunc<__ldcg_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_i4x4", - &I::genCUDALDXXFunc<__ldcs_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_i8x2", - &I::genCUDALDXXFunc<__ldcs_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r2x2", - &I::genCUDALDXXFunc<__ldcs_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r4x4", - &I::genCUDALDXXFunc<__ldcs_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcs_r8x2", - &I::genCUDALDXXFunc<__ldcs_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_i4x4", - &I::genCUDALDXXFunc<__ldcv_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_i8x2", - &I::genCUDALDXXFunc<__ldcv_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - 
{"__ldcv_r2x2", - &I::genCUDALDXXFunc<__ldcv_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_r4x4", - &I::genCUDALDXXFunc<__ldcv_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldcv_r8x2", - &I::genCUDALDXXFunc<__ldcv_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_i4x4", - &I::genCUDALDXXFunc<__ldlu_i4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_i8x2", - &I::genCUDALDXXFunc<__ldlu_i8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r2x2", - &I::genCUDALDXXFunc<__ldlu_r2x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r4x4", - &I::genCUDALDXXFunc<__ldlu_r4x4, 4>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, - {"__ldlu_r8x2", - &I::genCUDALDXXFunc<__ldlu_r8x2, 2>, - {{{"a", asAddr}}}, - /*isElemental=*/false}, {"abort", &I::genAbort}, {"abs", &I::genAbs}, {"achar", &I::genChar}, @@ -263,10 +135,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genAll, {{{"mask", asAddr}, {"dim", asValue}}}, /*isElemental=*/false}, - {"all_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::all>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, {"allocated", &I::genAllocated, {{{"array", asInquired}, {"scalar", asInquired}}}, @@ -276,10 +144,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genAny, {{{"mask", asAddr}, {"dim", asValue}}}, /*isElemental=*/false}, - {"any_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::any>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, {"asind", &I::genAsind}, {"asinpi", &I::genAsinpi}, {"associated", @@ -290,83 +154,6 @@ static constexpr IntrinsicHandler handlers[]{ {"atan2pi", &I::genAtanpi}, {"atand", &I::genAtand}, {"atanpi", &I::genAtanpi}, - {"atomicaddd", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddf", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddi", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicaddl", &I::genAtomicAdd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicandi", &I::genAtomicAnd, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomiccasd", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasf", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasi", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomiccasul", - &I::genAtomicCas, - {{{"a", asAddr}, {"v1", asValue}, {"v2", asValue}}}, - false}, - {"atomicdeci", &I::genAtomicDec, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicexchd", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchf", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchi", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicexchul", - &I::genAtomicExch, - {{{"a", asAddr}, {"v", asValue}}}, - false}, - {"atomicinci", &I::genAtomicInc, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxd", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxf", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxi", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmaxl", &I::genAtomicMax, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmind", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicminf", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicmini", 
&I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicminl", &I::genAtomicMin, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicori", &I::genAtomicOr, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubd", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubf", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubi", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicsubl", &I::genAtomicSub, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"atomicxori", &I::genAtomicXor, {{{"a", asAddr}, {"v", asValue}}}, false}, - {"ballot_sync", - &I::genVoteSync<mlir::NVVM::VoteSyncKind::ballot>, - {{{"mask", asValue}, {"pred", asValue}}}, - /*isElemental=*/false}, - {"barrier_arrive", - &I::genBarrierArrive, - {{{"barrier", asAddr}}}, - /*isElemental=*/false}, - {"barrier_arrive_cnt", - &I::genBarrierArriveCnt, - {{{"barrier", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"barrier_init", - &I::genBarrierInit, - {{{"barrier", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"barrier_try_wait", - &I::genBarrierTryWait, - {{{"barrier", asAddr}, {"token", asValue}}}, - /*isElemental=*/false}, - {"barrier_try_wait_sleep", - &I::genBarrierTryWaitSleep, - {{{"barrier", asAddr}, {"token", asValue}, {"ns", asValue}}}, - /*isElemental=*/false}, {"bessel_jn", &I::genBesselJn, {{{"n1", asValue}, {"n2", asValue}, {"x", asValue}}}, @@ -410,11 +197,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genChdir, {{{"name", asAddr}, {"status", asAddr, handleDynamicOptional}}}, /*isElemental=*/false}, - {"clock", &I::genNVVMTime<mlir::NVVM::ClockOp>, {}, /*isElemental=*/false}, - {"clock64", - &I::genNVVMTime<mlir::NVVM::Clock64Op>, - {}, - /*isElemental=*/false}, {"cmplx", &I::genCmplx, {{{"x", asValue}, {"y", asValue, handleDynamicOptional}}}}, @@ -511,10 +293,6 @@ static constexpr IntrinsicHandler handlers[]{ &I::genExtendsTypeOf, {{{"a", asBox}, {"mold", asBox}}}, /*isElemental=*/false}, - {"fence_proxy_async", - &I::genFenceProxyAsync, - {}, - /*isElemental=*/false}, {"findloc", &I::genFindloc, {{{"array", asBox}, @@ -569,10 +347,6 @@ static constexpr IntrinsicHandler handlers[]{ {"getgid", &I::genGetGID}, {"getpid", &I::genGetPID}, {"getuid", &I::genGetUID}, - {"globaltimer", - &I::genNVVMTime<mlir::NVVM::GlobalTimerOp>, - {}, - /*isElemental=*/false}, {"hostnm", &I::genHostnm, {{{"c", asBox}, {"status", asAddr, handleDynamicOptional}}}, @@ -740,38 +514,6 @@ static constexpr IntrinsicHandler handlers[]{ {"malloc", &I::genMalloc}, {"maskl", &I::genMask<mlir::arith::ShLIOp>}, {"maskr", &I::genMask<mlir::arith::ShRUIOp>}, - {"match_all_syncjd", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjf", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjj", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_all_syncjx", - &I::genMatchAllSync, - {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}}, - /*isElemental=*/false}, - {"match_any_syncjd", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - {"match_any_syncjf", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - {"match_any_syncjj", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, - 
{"match_any_syncjx", - &I::genMatchAnySync, - {{{"mask", asValue}, {"value", asValue}}}, - /*isElemental=*/false}, {"matmul", &I::genMatmul, {{{"matrix_a", asAddr}, {"matrix_b", asAddr}}}, @@ -997,20 +739,6 @@ static constexpr IntrinsicHandler handlers[]{ {"dim", asValue}, {"mask", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, - {"syncthreads", &I::genSyncThreads, {}, /*isElemental=*/false}, - {"syncthreads_and_i4", &I::genSyncThreadsAnd, {}, /*isElemental=*/false}, - {"syncthreads_and_l4", &I::genSyncThreadsAnd, {}, /*isElemental=*/false}, - {"syncthreads_count_i4", - &I::genSyncThreadsCount, - {}, - /*isElemental=*/false}, - {"syncthreads_count_l4", - &I::genSyncThreadsCount, - {}, - /*isElemental=*/false}, - {"syncthreads_or_i4", &I::genSyncThreadsOr, {}, /*isElemental=*/false}, - {"syncthreads_or_l4", &I::genSyncThreadsOr, {}, /*isElemental=*/false}, - {"syncwarp", &I::genSyncWarp, {}, /*isElemental=*/false}, {"system", &I::genSystem, {{{"command", asBox}, {"exitstat", asBox, handleDynamicOptional}}}, @@ -1021,115 +749,13 @@ static constexpr IntrinsicHandler handlers[]{ /*isElemental=*/false}, {"tand", &I::genTand}, {"tanpi", &I::genTanpi}, - {"this_grid", &I::genThisGrid, {}, /*isElemental=*/false}, {"this_image", &I::genThisImage, {{{"coarray", asBox}, {"dim", asAddr}, {"team", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, - {"this_thread_block", &I::genThisThreadBlock, {}, /*isElemental=*/false}, - {"this_warp", &I::genThisWarp, {}, /*isElemental=*/false}, - {"threadfence", &I::genThreadFence, {}, /*isElemental=*/false}, - {"threadfence_block", &I::genThreadFenceBlock, {}, /*isElemental=*/false}, - {"threadfence_system", &I::genThreadFenceSystem, {}, /*isElemental=*/false}, {"time", &I::genTime, {}, /*isElemental=*/false}, - {"tma_bulk_commit_group", - &I::genTMABulkCommitGroup, - {{}}, - /*isElemental=*/false}, - {"tma_bulk_g2s", - &I::genTMABulkG2S, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nbytes", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldc4", - &I::genTMABulkLoadC4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldc8", - &I::genTMABulkLoadC8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldi4", - &I::genTMABulkLoadI4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldi8", - &I::genTMABulkLoadI8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr2", - &I::genTMABulkLoadR2, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr4", - &I::genTMABulkLoadR4, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_ldr8", - &I::genTMABulkLoadR8, - {{{"barrier", asAddr}, - {"src", asAddr}, - {"dst", asAddr}, - {"nelems", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_s2g", - &I::genTMABulkS2G, - {{{"src", asAddr}, {"dst", asAddr}, {"nbytes", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_c4", - &I::genTMABulkStoreC4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_c8", - &I::genTMABulkStoreC8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_i4", 
- &I::genTMABulkStoreI4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_i8", - &I::genTMABulkStoreI8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r2", - &I::genTMABulkStoreR2, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r4", - &I::genTMABulkStoreR4, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_store_r8", - &I::genTMABulkStoreR8, - {{{"src", asAddr}, {"dst", asAddr}, {"count", asValue}}}, - /*isElemental=*/false}, - {"tma_bulk_wait_group", - &I::genTMABulkWaitGroup, - {{}}, - /*isElemental=*/false}, {"trailz", &I::genTrailz}, {"transfer", &I::genTransfer, @@ -2221,6 +1847,9 @@ lookupIntrinsicHandler(fir::FirOpBuilder &builder, if (isPPCTarget) if (const IntrinsicHandler *ppcHandler = findPPCIntrinsicHandler(name)) return std::make_optional<IntrinsicHandlerEntry>(ppcHandler); + // TODO: Look for CUDA intrinsic handlers only if CUDA is enabled. + if (const IntrinsicHandler *cudaHandler = findCUDAIntrinsicHandler(name)) + return std::make_optional<IntrinsicHandlerEntry>(cudaHandler); // Subroutines should have a handler. if (!resultType) return std::nullopt; @@ -3107,159 +2736,6 @@ mlir::Value IntrinsicLibrary::genAtanpi(mlir::Type resultType, return mlir::arith::MulFOp::create(builder, loc, atan, factor); } -static mlir::Value genAtomBinOp(fir::FirOpBuilder &builder, mlir::Location &loc, - mlir::LLVM::AtomicBinOp binOp, mlir::Value arg0, - mlir::Value arg1) { - auto llvmPointerType = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - arg0 = builder.createConvert(loc, llvmPointerType, arg0); - return mlir::LLVM::AtomicRMWOp::create(builder, loc, binOp, arg0, arg1, - mlir::LLVM::AtomicOrdering::seq_cst); -} - -mlir::Value IntrinsicLibrary::genAtomicAdd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? mlir::LLVM::AtomicBinOp::add - : mlir::LLVM::AtomicBinOp::fadd; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicSub(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? 
mlir::LLVM::AtomicBinOp::sub - : mlir::LLVM::AtomicBinOp::fsub; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicAnd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_and; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicOr(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::_or; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICCAS -fir::ExtendedValue -IntrinsicLibrary::genAtomicCas(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - auto successOrdering = mlir::LLVM::AtomicOrdering::acq_rel; - auto failureOrdering = mlir::LLVM::AtomicOrdering::monotonic; - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(resultType.getContext()); - - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - mlir::Value arg2 = fir::getBase(args[2]); - - auto bitCastFloat = [&](mlir::Value arg) -> mlir::Value { - if (mlir::isa<mlir::Float32Type>(arg.getType())) - return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI32Type(), - arg); - if (mlir::isa<mlir::Float64Type>(arg.getType())) - return mlir::LLVM::BitcastOp::create(builder, loc, builder.getI64Type(), - arg); - return arg; - }; - - arg1 = bitCastFloat(arg1); - arg2 = bitCastFloat(arg2); - - if (arg1.getType() != arg2.getType()) { - // arg1 and arg2 need to have the same type in AtomicCmpXchgOp. 
- arg2 = builder.createConvert(loc, arg1.getType(), arg2); - } - - auto address = - mlir::UnrealizedConversionCastOp::create(builder, loc, llvmPtrTy, arg0) - .getResult(0); - auto cmpxchg = mlir::LLVM::AtomicCmpXchgOp::create( - builder, loc, address, arg1, arg2, successOrdering, failureOrdering); - mlir::Value boolResult = - mlir::LLVM::ExtractValueOp::create(builder, loc, cmpxchg, 1); - return builder.createConvert(loc, resultType, boolResult); -} - -mlir::Value IntrinsicLibrary::genAtomicDec(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::udec_wrap; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICEXCH -fir::ExtendedValue -IntrinsicLibrary::genAtomicExch(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - assert(arg1.getType().isIntOrFloat()); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::xchg; - return genAtomBinOp(builder, loc, binOp, arg0, arg1); -} - -mlir::Value IntrinsicLibrary::genAtomicInc(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - assert(mlir::isa<mlir::IntegerType>(args[1].getType())); - - mlir::LLVM::AtomicBinOp binOp = mlir::LLVM::AtomicBinOp::uinc_wrap; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicMax(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? mlir::LLVM::AtomicBinOp::max - : mlir::LLVM::AtomicBinOp::fmax; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -mlir::Value IntrinsicLibrary::genAtomicMin(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - - mlir::LLVM::AtomicBinOp binOp = - mlir::isa<mlir::IntegerType>(args[1].getType()) - ? 
mlir::LLVM::AtomicBinOp::min - : mlir::LLVM::AtomicBinOp::fmin; - return genAtomBinOp(builder, loc, binOp, args[0], args[1]); -} - -// ATOMICXOR -fir::ExtendedValue -IntrinsicLibrary::genAtomicXor(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value arg0 = fir::getBase(args[0]); - mlir::Value arg1 = fir::getBase(args[1]); - return genAtomBinOp(builder, loc, mlir::LLVM::AtomicBinOp::_xor, arg0, arg1); -} - // ASSOCIATED fir::ExtendedValue IntrinsicLibrary::genAssociated(mlir::Type resultType, @@ -3311,114 +2787,6 @@ IntrinsicLibrary::genAssociated(mlir::Type resultType, return fir::runtime::genAssociated(builder, loc, pointerBox, targetBox); } -static mlir::Value convertPtrToNVVMSpace(fir::FirOpBuilder &builder, - mlir::Location loc, - mlir::Value barrier, - mlir::NVVM::NVVMMemorySpace space) { - mlir::Value llvmPtr = fir::ConvertOp::create( - builder, loc, mlir::LLVM::LLVMPointerType::get(builder.getContext()), - barrier); - mlir::Value addrCast = mlir::LLVM::AddrSpaceCastOp::create( - builder, loc, - mlir::LLVM::LLVMPointerType::get(builder.getContext(), - static_cast<unsigned>(space)), - llvmPtr); - return addrCast; -} - -// BARRIER_ARRIVE (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierArrive(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 1); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); - return mlir::NVVM::MBarrierArriveSharedOp::create(builder, loc, resultType, - barrier) - .getResult(); -} - -// BARRIER_ARRIBVE_CNT (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierArriveCnt(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, args[0], mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value token = fir::AllocaOp::create(builder, loc, resultType); - // TODO: the MBarrierArriveExpectTxOp is not taking the state argument and - // currently just the sink symbol `_`. 
- // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive - mlir::NVVM::MBarrierArriveExpectTxOp::create(builder, loc, barrier, args[1], - {}); - return fir::LoadOp::create(builder, loc, token); -} - -// BARRIER_INIT (CUDA) -void IntrinsicLibrary::genBarrierInit(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 2); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); - mlir::NVVM::MBarrierInitOp::create(builder, loc, barrier, - fir::getBase(args[1]), {}); - auto kind = mlir::NVVM::ProxyKindAttr::get( - builder.getContext(), mlir::NVVM::ProxyKind::async_shared); - auto space = mlir::NVVM::SharedSpaceAttr::get( - builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); - mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); -} - -// BARRIER_TRY_WAIT (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierTryWait(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Value zero = builder.createIntegerConstant(loc, resultType, 0); - fir::StoreOp::create(builder, loc, zero, res); - mlir::Value ns = - builder.createIntegerConstant(loc, builder.getI32Type(), 1000000); - mlir::Value load = fir::LoadOp::create(builder, loc, res); - auto whileOp = mlir::scf::WhileOp::create( - builder, loc, mlir::TypeRange{resultType}, mlir::ValueRange{load}); - mlir::Block *beforeBlock = builder.createBlock(&whileOp.getBefore()); - mlir::Value beforeArg = beforeBlock->addArgument(resultType, loc); - builder.setInsertionPointToStart(beforeBlock); - mlir::Value condition = mlir::arith::CmpIOp::create( - builder, loc, mlir::arith::CmpIPredicate::ne, beforeArg, zero); - mlir::scf::ConditionOp::create(builder, loc, condition, beforeArg); - mlir::Block *afterBlock = builder.createBlock(&whileOp.getAfter()); - afterBlock->addArgument(resultType, loc); - builder.setInsertionPointToStart(afterBlock); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); - mlir::Value ret = - mlir::NVVM::InlinePtxOp::create( - builder, loc, {resultType}, {barrier, args[1], ns}, {}, - ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " - "selp.b32 %0, 1, 0, p;", - {}) - .getResult(0); - mlir::scf::YieldOp::create(builder, loc, ret); - builder.setInsertionPointAfter(whileOp); - return whileOp.getResult(0); -} - -// BARRIER_TRY_WAIT_SLEEP (CUDA) -mlir::Value -IntrinsicLibrary::genBarrierTryWaitSleep(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 3); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - auto barrier = builder.createConvert(loc, llvmPtrTy, args[0]); - return mlir::NVVM::InlinePtxOp::create( - builder, loc, {resultType}, {barrier, args[1], args[2]}, {}, - ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%1], %2, %3; " - "selp.b32 %0, 1, 0, p;", - {}) - .getResult(0); -} - // BESSEL_JN fir::ExtendedValue IntrinsicLibrary::genBesselJn(mlir::Type resultType, @@ -4152,30 +3520,6 @@ IntrinsicLibrary::genCshift(mlir::Type resultType, return readAndAddCleanUp(resultMutableBox, resultType, "CSHIFT"); } -// __LDCA, __LDCS, __LDLU, __LDCV -template <const char *fctName, int extent> -fir::ExtendedValue -IntrinsicLibrary::genCUDALDXXFunc(mlir::Type resultType, - llvm::ArrayRef<fir::ExtendedValue> args) { - 
assert(args.size() == 1); - mlir::Type resTy = fir::SequenceType::get(extent, resultType); - mlir::Value arg = fir::getBase(args[0]); - mlir::Value res = fir::AllocaOp::create(builder, loc, resTy); - if (mlir::isa<fir::BaseBoxType>(arg.getType())) - arg = fir::BoxAddrOp::create(builder, loc, arg); - mlir::Type refResTy = fir::ReferenceType::get(resTy); - mlir::FunctionType ftype = - mlir::FunctionType::get(arg.getContext(), {refResTy, refResTy}, {}); - auto funcOp = builder.createFunction(loc, fctName, ftype); - llvm::SmallVector<mlir::Value> funcArgs; - funcArgs.push_back(res); - funcArgs.push_back(arg); - fir::CallOp::create(builder, loc, funcOp, funcArgs); - mlir::Value ext = - builder.createIntegerConstant(loc, builder.getIndexType(), extent); - return fir::ArrayBoxValue(res, {ext}); -} - // DATE_AND_TIME void IntrinsicLibrary::genDateAndTime(llvm::ArrayRef<fir::ExtendedValue> args) { assert(args.size() == 4 && "date_and_time has 4 args"); @@ -4508,17 +3852,6 @@ IntrinsicLibrary::genExtendsTypeOf(mlir::Type resultType, fir::getBase(args[1]))); } -// FENCE_PROXY_ASYNC (CUDA) -void IntrinsicLibrary::genFenceProxyAsync( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 0); - auto kind = mlir::NVVM::ProxyKindAttr::get( - builder.getContext(), mlir::NVVM::ProxyKind::async_shared); - auto space = mlir::NVVM::SharedSpaceAttr::get( - builder.getContext(), mlir::NVVM::SharedSpace::shared_cta); - mlir::NVVM::FenceProxyOp::create(builder, loc, kind, space); -} - // FINDLOC fir::ExtendedValue IntrinsicLibrary::genFindloc(mlir::Type resultType, @@ -7029,67 +6362,6 @@ mlir::Value IntrinsicLibrary::genMask(mlir::Type resultType, return result; } -// MATCH_ALL_SYNC -mlir::Value -IntrinsicLibrary::genMatchAllSync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 3); - bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); - - mlir::Type i1Ty = builder.getI1Type(); - mlir::MLIRContext *context = builder.getContext(); - - mlir::Value arg1 = args[1]; - if (arg1.getType().isF32() || arg1.getType().isF64()) - arg1 = fir::ConvertOp::create( - builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); - - mlir::Type retTy = - mlir::LLVM::LLVMStructType::getLiteral(context, {resultType, i1Ty}); - auto match = - mlir::NVVM::MatchSyncOp::create(builder, loc, retTy, args[0], arg1, - mlir::NVVM::MatchSyncKind::all) - .getResult(); - auto value = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 0); - auto pred = mlir::LLVM::ExtractValueOp::create(builder, loc, match, 1); - auto conv = mlir::LLVM::ZExtOp::create(builder, loc, resultType, pred); - fir::StoreOp::create(builder, loc, conv, args[2]); - return value; -} - -// ALL_SYNC, ANY_SYNC, BALLOT_SYNC -template <mlir::NVVM::VoteSyncKind kind> -mlir::Value IntrinsicLibrary::genVoteSync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - mlir::Value arg1 = - fir::ConvertOp::create(builder, loc, builder.getI1Type(), args[1]); - mlir::Type resTy = kind == mlir::NVVM::VoteSyncKind::ballot - ? 
builder.getI32Type() - : builder.getI1Type(); - auto voteRes = - mlir::NVVM::VoteSyncOp::create(builder, loc, resTy, args[0], arg1, kind) - .getResult(); - return fir::ConvertOp::create(builder, loc, resultType, voteRes); -} - -// MATCH_ANY_SYNC -mlir::Value -IntrinsicLibrary::genMatchAnySync(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 2); - bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32(); - - mlir::Value arg1 = args[1]; - if (arg1.getType().isF32() || arg1.getType().isF64()) - arg1 = fir::ConvertOp::create( - builder, loc, is32 ? builder.getI32Type() : builder.getI64Type(), arg1); - - return mlir::NVVM::MatchSyncOp::create(builder, loc, resultType, args[0], - arg1, mlir::NVVM::MatchSyncKind::any) - .getResult(); -} - // MATMUL fir::ExtendedValue IntrinsicLibrary::genMatmul(mlir::Type resultType, @@ -7707,14 +6979,6 @@ IntrinsicLibrary::genNumImages(mlir::Type resultType, return mif::NumImagesOp::create(builder, loc).getResult(); } -// CLOCK, CLOCK64, GLOBALTIMER -template <typename OpTy> -mlir::Value IntrinsicLibrary::genNVVMTime(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0 && "expect no arguments"); - return OpTy::create(builder, loc, resultType).getResult(); -} - // PACK fir::ExtendedValue IntrinsicLibrary::genPack(mlir::Type resultType, @@ -8689,92 +7953,6 @@ mlir::Value IntrinsicLibrary::genTanpi(mlir::Type resultType, return getRuntimeCallGenerator("tan", ftype)(builder, loc, {arg}); } -// THIS_GRID -mlir::Value IntrinsicLibrary::genThisGrid(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expepected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); - mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); - mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); - - mlir::Value blockIdX = mlir::NVVM::BlockIdXOp::create(builder, loc, i32Ty); - mlir::Value blockIdY = mlir::NVVM::BlockIdYOp::create(builder, loc, i32Ty); - mlir::Value blockIdZ = mlir::NVVM::BlockIdZOp::create(builder, loc, i32Ty); - - mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); - mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); - mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); - mlir::Value gridDimX = mlir::NVVM::GridDimXOp::create(builder, loc, i32Ty); - mlir::Value gridDimY = mlir::NVVM::GridDimYOp::create(builder, loc, i32Ty); - mlir::Value gridDimZ = mlir::NVVM::GridDimZOp::create(builder, loc, i32Ty); - - // this_grid.size = ((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y)) * - // (blockDim.x * gridDim.x); - mlir::Value resZ = - mlir::arith::MulIOp::create(builder, loc, blockDimZ, gridDimZ); - mlir::Value resY = - mlir::arith::MulIOp::create(builder, loc, blockDimY, gridDimY); - mlir::Value resX = - mlir::arith::MulIOp::create(builder, loc, blockDimX, gridDimX); - mlir::Value resZY = mlir::arith::MulIOp::create(builder, loc, resZ, resY); - mlir::Value size = mlir::arith::MulIOp::create(builder, loc, resZY, resX); - - // tmp = ((blockIdx.z * gridDim.y * gridDim.x) + (blockIdx.y * gridDim.x)) + - // blockIdx.x; - // this_group.rank = tmp * ((blockDim.x * blockDim.y) * blockDim.z) + - // ((threadIdx.z * 
blockDim.y) * blockDim.x) + - // (threadIdx.y * blockDim.x) + threadIdx.x + 1; - mlir::Value r1 = - mlir::arith::MulIOp::create(builder, loc, blockIdZ, gridDimY); - mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, gridDimX); - mlir::Value r3 = - mlir::arith::MulIOp::create(builder, loc, blockIdY, gridDimX); - mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); - mlir::Value tmp = mlir::arith::AddIOp::create(builder, loc, r2r3, blockIdX); - - mlir::Value bXbY = - mlir::arith::MulIOp::create(builder, loc, blockDimX, blockDimY); - mlir::Value bXbYbZ = - mlir::arith::MulIOp::create(builder, loc, bXbY, blockDimZ); - mlir::Value tZbY = - mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); - mlir::Value tZbYbX = - mlir::arith::MulIOp::create(builder, loc, tZbY, blockDimX); - mlir::Value tYbX = - mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); - mlir::Value rank = mlir::arith::MulIOp::create(builder, loc, tmp, bXbYbZ); - rank = mlir::arith::AddIOp::create(builder, loc, rank, tZbYbX); - rank = mlir::arith::AddIOp::create(builder, loc, rank, tYbX); - rank = mlir::arith::AddIOp::create(builder, loc, rank, threadIdX); - mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - rank = mlir::arith::AddIOp::create(builder, loc, rank, one); - - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; -} - // THIS_IMAGE fir::ExtendedValue IntrinsicLibrary::genThisImage(mlir::Type resultType, @@ -8790,99 +7968,6 @@ IntrinsicLibrary::genThisImage(mlir::Type resultType, return builder.createConvert(loc, resultType, res); } -// THIS_THREAD_BLOCK -mlir::Value -IntrinsicLibrary::genThisThreadBlock(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expepected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - // this_thread_block%size = blockDim.z * blockDim.y * blockDim.x; - mlir::Value blockDimX = mlir::NVVM::BlockDimXOp::create(builder, loc, i32Ty); - mlir::Value blockDimY = mlir::NVVM::BlockDimYOp::create(builder, loc, i32Ty); - mlir::Value blockDimZ = mlir::NVVM::BlockDimZOp::create(builder, loc, i32Ty); - mlir::Value size = - mlir::arith::MulIOp::create(builder, loc, blockDimZ, blockDimY); - size = mlir::arith::MulIOp::create(builder, loc, size, blockDimX); - - // this_thread_block%rank = ((threadIdx.z * blockDim.y) * blockDim.x) + - // (threadIdx.y * blockDim.x) + threadIdx.x + 1; - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, 
i32Ty); - mlir::Value threadIdY = mlir::NVVM::ThreadIdYOp::create(builder, loc, i32Ty); - mlir::Value threadIdZ = mlir::NVVM::ThreadIdZOp::create(builder, loc, i32Ty); - mlir::Value r1 = - mlir::arith::MulIOp::create(builder, loc, threadIdZ, blockDimY); - mlir::Value r2 = mlir::arith::MulIOp::create(builder, loc, r1, blockDimX); - mlir::Value r3 = - mlir::arith::MulIOp::create(builder, loc, threadIdY, blockDimX); - mlir::Value r2r3 = mlir::arith::AddIOp::create(builder, loc, r2, r3); - mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, r2r3, threadIdX); - mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - rank = mlir::arith::AddIOp::create(builder, loc, rank, one); - - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; -} - -// THIS_WARP -mlir::Value IntrinsicLibrary::genThisWarp(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - assert(args.size() == 0); - auto recTy = mlir::cast<fir::RecordType>(resultType); - assert(recTy && "RecordType expepected"); - mlir::Value res = fir::AllocaOp::create(builder, loc, resultType); - mlir::Type i32Ty = builder.getI32Type(); - - // coalesced_group%size = 32 - mlir::Value size = builder.createIntegerConstant(loc, i32Ty, 32); - auto sizeFieldName = recTy.getTypeList()[1].first; - mlir::Type sizeFieldTy = recTy.getTypeList()[1].second; - mlir::Type fieldIndexType = fir::FieldType::get(resultType.getContext()); - mlir::Value sizeFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, sizeFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value sizeCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(sizeFieldTy), res, sizeFieldIndex); - fir::StoreOp::create(builder, loc, size, sizeCoord); - - // coalesced_group%rank = threadIdx.x & 31 + 1 - mlir::Value threadIdX = mlir::NVVM::ThreadIdXOp::create(builder, loc, i32Ty); - mlir::Value mask = builder.createIntegerConstant(loc, i32Ty, 31); - mlir::Value one = builder.createIntegerConstant(loc, i32Ty, 1); - mlir::Value masked = - mlir::arith::AndIOp::create(builder, loc, threadIdX, mask); - mlir::Value rank = mlir::arith::AddIOp::create(builder, loc, masked, one); - auto rankFieldName = recTy.getTypeList()[2].first; - mlir::Type rankFieldTy = recTy.getTypeList()[2].second; - mlir::Value rankFieldIndex = fir::FieldIndexOp::create( - builder, loc, fieldIndexType, rankFieldName, recTy, - /*typeParams=*/mlir::ValueRange{}); - mlir::Value rankCoord = fir::CoordinateOp::create( - builder, loc, builder.getRefType(rankFieldTy), res, rankFieldIndex); - fir::StoreOp::create(builder, loc, rank, rankCoord); - return res; -} - // TRAILZ 
mlir::Value IntrinsicLibrary::genTrailz(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { @@ -9104,65 +8189,6 @@ IntrinsicLibrary::genSum(mlir::Type resultType, resultType, args); } -// SYNCTHREADS -void IntrinsicLibrary::genSyncThreads(llvm::ArrayRef<fir::ExtendedValue> args) { - mlir::NVVM::Barrier0Op::create(builder, loc); -} - -// SYNCTHREADS_AND -mlir::Value -IntrinsicLibrary::genSyncThreadsAnd(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.and"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCTHREADS_COUNT -mlir::Value -IntrinsicLibrary::genSyncThreadsCount(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.popc"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCTHREADS_OR -mlir::Value -IntrinsicLibrary::genSyncThreadsOr(mlir::Type resultType, - llvm::ArrayRef<mlir::Value> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.barrier0.or"; - mlir::MLIRContext *context = builder.getContext(); - mlir::Type i32 = builder.getI32Type(); - mlir::FunctionType ftype = - mlir::FunctionType::get(context, {resultType}, {i32}); - auto funcOp = builder.createFunction(loc, funcName, ftype); - mlir::Value arg = builder.createConvert(loc, i32, args[0]); - return fir::CallOp::create(builder, loc, funcOp, {arg}).getResult(0); -} - -// SYNCWARP -void IntrinsicLibrary::genSyncWarp(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 1); - constexpr llvm::StringLiteral funcName = "llvm.nvvm.bar.warp.sync"; - mlir::Value mask = fir::getBase(args[0]); - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {mask.getType()}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> argsList{mask}; - fir::CallOp::create(builder, loc, funcOp, argsList); -} - // SYSTEM fir::ExtendedValue IntrinsicLibrary::genSystem(std::optional<mlir::Type> resultType, @@ -9294,38 +8320,6 @@ IntrinsicLibrary::genTranspose(mlir::Type resultType, return readAndAddCleanUp(resultMutableBox, resultType, "TRANSPOSE"); } -// THREADFENCE -void IntrinsicLibrary::genThreadFence(llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.gl"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - -// THREADFENCE_BLOCK -void IntrinsicLibrary::genThreadFenceBlock( - llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.cta"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = 
builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - -// THREADFENCE_SYSTEM -void IntrinsicLibrary::genThreadFenceSystem( - llvm::ArrayRef<fir::ExtendedValue> args) { - constexpr llvm::StringLiteral funcName = "llvm.nvvm.membar.sys"; - mlir::FunctionType funcType = - mlir::FunctionType::get(builder.getContext(), {}, {}); - auto funcOp = builder.createFunction(loc, funcName, funcType); - llvm::SmallVector<mlir::Value> noArgs; - fir::CallOp::create(builder, loc, funcOp, noArgs); -} - // TIME mlir::Value IntrinsicLibrary::genTime(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args) { @@ -9334,226 +8328,6 @@ mlir::Value IntrinsicLibrary::genTime(mlir::Type resultType, fir::runtime::genTime(builder, loc)); } -// TMA_BULK_COMMIT_GROUP (CUDA) -void IntrinsicLibrary::genTMABulkCommitGroup( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 0); - mlir::NVVM::CpAsyncBulkCommitGroupOp::create(builder, loc); -} - -// TMA_BULK_G2S (CUDA) -void IntrinsicLibrary::genTMABulkG2S(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value barrier = convertPtrToNVVMSpace( - builder, loc, fir::getBase(args[0]), mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value dst = - convertPtrToNVVMSpace(builder, loc, fir::getBase(args[2]), - mlir::NVVM::NVVMMemorySpace::SharedCluster); - mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkGlobalToSharedClusterOp::create( - builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {}); -} - -static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::Value barrier, mlir::Value src, - mlir::Value dst, mlir::Value nelem, - mlir::Value eleSize) { - mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize); - auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext()); - barrier = builder.createConvert(loc, llvmPtrTy, barrier); - dst = builder.createConvert(loc, llvmPtrTy, dst); - src = builder.createConvert(loc, llvmPtrTy, src); - mlir::NVVM::InlinePtxOp::create( - builder, loc, mlir::TypeRange{}, {dst, src, size, barrier}, {}, - "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%0], " - "[%1], %2, [%3];", - {}); - mlir::NVVM::InlinePtxOp::create( - builder, loc, mlir::TypeRange{}, {barrier, size}, {}, - "mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;", {}); -} - -// TMA_BULK_LOADC4 -void IntrinsicLibrary::genTMABulkLoadC4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADC8 -void IntrinsicLibrary::genTMABulkLoadC8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 16); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADI4 -void IntrinsicLibrary::genTMABulkLoadI4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), 
fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADI8 -void IntrinsicLibrary::genTMABulkLoadI8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR2 -void IntrinsicLibrary::genTMABulkLoadR2( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 2); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR4 -void IntrinsicLibrary::genTMABulkLoadR4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_LOADR8 -void IntrinsicLibrary::genTMABulkLoadR8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 4); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkLoad(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), fir::getBase(args[3]), eleSize); -} - -// TMA_BULK_S2G (CUDA) -void IntrinsicLibrary::genTMABulkS2G(llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value src = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[0]), - mlir::NVVM::NVVMMemorySpace::Shared); - mlir::Value dst = convertPtrToNVVMSpace(builder, loc, fir::getBase(args[1]), - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create( - builder, loc, dst, src, fir::getBase(args[2]), {}, {}); - - mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, - "cp.async.bulk.commit_group", {}); - mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, - builder.getI32IntegerAttr(0), {}); -} - -static void genTMABulkStore(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::Value src, mlir::Value dst, mlir::Value count, - mlir::Value eleSize) { - mlir::Value size = mlir::arith::MulIOp::create(builder, loc, eleSize, count); - src = convertPtrToNVVMSpace(builder, loc, src, - mlir::NVVM::NVVMMemorySpace::Shared); - dst = convertPtrToNVVMSpace(builder, loc, dst, - mlir::NVVM::NVVMMemorySpace::Global); - mlir::NVVM::CpAsyncBulkSharedCTAToGlobalOp::create(builder, loc, dst, src, - size, {}, {}); - mlir::NVVM::InlinePtxOp::create(builder, loc, mlir::TypeRange{}, {}, {}, - "cp.async.bulk.commit_group", {}); - mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, - builder.getI32IntegerAttr(0), {}); -} - -// TMA_BULK_STORE_C4 (CUDA) -void IntrinsicLibrary::genTMABulkStoreC4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_C8 (CUDA) -void IntrinsicLibrary::genTMABulkStoreC8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 16); - 
genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_I4 (CUDA) -void IntrinsicLibrary::genTMABulkStoreI4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_I8 (CUDA) -void IntrinsicLibrary::genTMABulkStoreI8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_R2 (CUDA) -void IntrinsicLibrary::genTMABulkStoreR2( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 2); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_R4 (CUDA) -void IntrinsicLibrary::genTMABulkStoreR4( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 4); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_STORE_R8 (CUDA) -void IntrinsicLibrary::genTMABulkStoreR8( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 3); - mlir::Value eleSize = - builder.createIntegerConstant(loc, builder.getI32Type(), 8); - genTMABulkStore(builder, loc, fir::getBase(args[0]), fir::getBase(args[1]), - fir::getBase(args[2]), eleSize); -} - -// TMA_BULK_WAIT_GROUP (CUDA) -void IntrinsicLibrary::genTMABulkWaitGroup( - llvm::ArrayRef<fir::ExtendedValue> args) { - assert(args.size() == 0); - auto group = builder.getIntegerAttr(builder.getI32Type(), 0); - mlir::NVVM::CpAsyncBulkWaitGroupOp::create(builder, loc, group, {}); -} - // TRIM fir::ExtendedValue IntrinsicLibrary::genTrim(mlir::Type resultType, @@ -9968,6 +8742,9 @@ getIntrinsicArgumentLowering(llvm::StringRef specificName) { if (const IntrinsicHandler *ppcHandler = findPPCIntrinsicHandler(name)) if (!ppcHandler->argLoweringRules.hasDefaultRules()) return &ppcHandler->argLoweringRules; + if (const IntrinsicHandler *cudaHandler = findCUDAIntrinsicHandler(name)) + if (!cudaHandler->argLoweringRules.hasDefaultRules()) + return &cudaHandler->argLoweringRules; return nullptr; } diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index a9de26ea09ff8..4374acbbe51bf 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -1778,6 +1778,31 @@ struct OmpBlockConstructParser { llvm::omp::Directive dir_; }; +struct OmpDeclarativeAllocateParser { + using resultType = OmpAllocateDirective; + + std::optional<resultType> Parse(ParseState &state) const { + constexpr llvm::omp::Directive dir{llvm::omp::Directive::OMPD_allocate}; + if (auto &&begin{attempt(OmpBeginDirectiveParser(dir)).Parse(state)}) { + Block empty; + auto end{maybe(OmpEndDirectiveParser{dir}).Parse(state)}; + return OmpAllocateDirective{std::move(*begin), std::move(empty), + llvm::transformOptional(std::move(*end), + [](auto &&s) { return OmpEndDirective(std::move(s)); })}; + } + return std::nullopt; + } +}; + +struct 
OmpExecutableAllocateParser { + using resultType = OmpAllocateDirective; + + std::optional<resultType> Parse(ParseState &state) const { + OmpStatementConstructParser p{llvm::omp::Directive::OMPD_allocate}; + return construct<OmpAllocateDirective>(p).Parse(state); + } +}; + TYPE_PARSER(sourced(construct<OpenMPAllocatorsConstruct>( OmpStatementConstructParser{llvm::omp::Directive::OMPD_allocators}))) @@ -2044,14 +2069,6 @@ TYPE_PARSER(construct<OmpInitializerExpression>(OmpStylizedExpressionParser{})) TYPE_PARSER(sourced(construct<OpenMPCriticalConstruct>( OmpBlockConstructParser{llvm::omp::Directive::OMPD_critical}))) -// 2.11.3 Executable Allocate directive -TYPE_PARSER(sourced(construct<OpenMPExecutableAllocate>( - verbatim("ALLOCATE"_tok), maybe(parenthesized(Parser<OmpObjectList>{})), - Parser<OmpClauseList>{}, - maybe(nonemptyList(startOmpLine >> Parser<OpenMPDeclarativeAllocate>{})) / - endOmpLine, - statement(allocateStmt)))) - // 2.8.2 Declare Simd construct TYPE_PARSER(sourced(construct<OpenMPDeclareSimdConstruct>( predicated(Parser<OmpDirectiveName>{}, @@ -2077,13 +2094,6 @@ TYPE_PARSER(sourced( // IsDirective(llvm::omp::Directive::OMPD_threadprivate)) >= Parser<OmpDirectiveSpecification>{}))) -// 2.11.3 Declarative Allocate directive -TYPE_PARSER( - sourced(construct<OpenMPDeclarativeAllocate>(verbatim("ALLOCATE"_tok), - maybe(parenthesized(Parser<OmpObjectList>{})), - Parser<OmpClauseList>{})) / - lookAhead(endOmpLine / !statement(allocateStmt))) - // Assumes Construct TYPE_PARSER(sourced(construct<OpenMPDeclarativeAssumes>( predicated(OmpDirectiveNameParser{}, @@ -2106,7 +2116,7 @@ TYPE_PARSER( construct<OpenMPDeclarativeConstruct>( Parser<OmpDeclareVariantDirective>{}) || construct<OpenMPDeclarativeConstruct>( - Parser<OpenMPDeclarativeAllocate>{}) || + sourced(OmpDeclarativeAllocateParser{})) || construct<OpenMPDeclarativeConstruct>( Parser<OpenMPGroupprivate>{}) || construct<OpenMPDeclarativeConstruct>( @@ -2194,6 +2204,8 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US, withMessage("expected OpenMP construct"_err_en_US, first(construct<OpenMPConstruct>(Parser<OpenMPSectionsConstruct>{}), construct<OpenMPConstruct>(Parser<OpenMPLoopConstruct>{}), + construct<OpenMPConstruct>( + sourced(OmpExecutableAllocateParser{})), construct<OpenMPConstruct>(Parser<OmpBlockConstruct>{}), // OmpBlockConstruct is attempted before // OpenMPStandaloneConstruct to resolve !$OMP ORDERED @@ -2201,9 +2213,7 @@ TYPE_CONTEXT_PARSER("OpenMP construct"_en_US, construct<OpenMPConstruct>(Parser<OpenMPAtomicConstruct>{}), construct<OpenMPConstruct>(Parser<OpenMPUtilityConstruct>{}), construct<OpenMPConstruct>(Parser<OpenMPDispatchConstruct>{}), - construct<OpenMPConstruct>(Parser<OpenMPExecutableAllocate>{}), construct<OpenMPConstruct>(Parser<OpenMPAllocatorsConstruct>{}), - construct<OpenMPConstruct>(Parser<OpenMPDeclarativeAllocate>{}), construct<OpenMPConstruct>(Parser<OpenMPAssumeConstruct>{}), construct<OpenMPConstruct>(Parser<OpenMPCriticalConstruct>{})))) diff --git a/flang/lib/Parser/openmp-utils.cpp b/flang/lib/Parser/openmp-utils.cpp index 95ad3f60770f5..b9d3763cdd06d 100644 --- a/flang/lib/Parser/openmp-utils.cpp +++ b/flang/lib/Parser/openmp-utils.cpp @@ -22,6 +22,25 @@ namespace Fortran::parser::omp { +const OpenMPDeclarativeConstruct *GetOmp(const DeclarationConstruct &x) { + if (auto *y = std::get_if<SpecificationConstruct>(&x.u)) { + if (auto *z{std::get_if<common::Indirection<OpenMPDeclarativeConstruct>>( + &y->u)}) { + return &z->value(); + } + } + return nullptr; +} + +const 
OpenMPConstruct *GetOmp(const ExecutionPartConstruct &x) { + if (auto *y{std::get_if<ExecutableConstruct>(&x.u)}) { + if (auto *z{std::get_if<common::Indirection<OpenMPConstruct>>(&y->u)}) { + return &z->value(); + } + } + return nullptr; +} + const OmpObjectList *GetOmpObjectList(const OmpClause &clause) { // Clauses with OmpObjectList as its data member using MemberObjectListClauses = std::tuple<OmpClause::Copyin, @@ -86,4 +105,24 @@ const OmpInitializerExpression *GetInitializerExpr(const OmpClause &init) { return nullptr; } +static void SplitOmpAllocateHelper( + OmpAllocateInfo &n, const OmpAllocateDirective &x) { + n.dirs.push_back(&x); + const Block &body{std::get<Block>(x.t)}; + if (!body.empty()) { + if (auto *omp{GetOmp(body.front())}) { + if (auto *ad{std::get_if<OmpAllocateDirective>(&omp->u)}) { + return SplitOmpAllocateHelper(n, *ad); + } + } + n.body = &body.front(); + } +} + +OmpAllocateInfo SplitOmpAllocate(const OmpAllocateDirective &x) { + OmpAllocateInfo info; + SplitOmpAllocateHelper(info, x); + return info; +} + } // namespace Fortran::parser::omp diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index efce8fc3d2e35..8cccd84f9fa19 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -557,7 +557,7 @@ bool Prescanner::MustSkipToEndOfLine() const { return true; // skip over ignored columns in right margin (73:80) } else if (*at_ == '!' && !inCharLiteral_ && (!inFixedForm_ || tabInCurrentLine_ || column_ != 6)) { - return !IsCompilerDirectiveSentinel(at_ + 1); + return InCompilerDirective() || !IsCompilerDirectiveSentinel(at_ + 1); } else { return false; } diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index 9255c4e1136bc..84123030195e9 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2482,30 +2482,8 @@ class UnparseVisitor { Unparse(static_cast<const OmpBlockConstruct &>(x)); } - void Unparse(const OpenMPExecutableAllocate &x) { - const auto &fields = - std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( - x.t); - if (fields) { - for (const auto &decl : *fields) { - Walk(decl); - } - } - BeginOpenMP(); - Word("!$OMP ALLOCATE"); - Walk(" (", std::get<std::optional<OmpObjectList>>(x.t), ")"); - Walk(std::get<OmpClauseList>(x.t)); - Put("\n"); - EndOpenMP(); - Walk(std::get<Statement<AllocateStmt>>(x.t)); - } - void Unparse(const OpenMPDeclarativeAllocate &x) { - BeginOpenMP(); - Word("!$OMP ALLOCATE"); - Walk(" (", std::get<std::optional<OmpObjectList>>(x.t), ")"); - Walk(std::get<OmpClauseList>(x.t)); - Put("\n"); - EndOpenMP(); + void Unparse(const OmpAllocateDirective &x) { + Unparse(static_cast<const OmpBlockConstruct &>(x)); } void Unparse(const OpenMPAllocatorsConstruct &x) { Unparse(static_cast<const OmpBlockConstruct &>(x)); diff --git a/flang/lib/Semantics/canonicalize-omp.cpp b/flang/lib/Semantics/canonicalize-omp.cpp index c884658bf464a..a11c5250b1ab4 100644 --- a/flang/lib/Semantics/canonicalize-omp.cpp +++ b/flang/lib/Semantics/canonicalize-omp.cpp @@ -51,8 +51,6 @@ class CanonicalizationOfOmp { } // Block list } - void Post(parser::ExecutionPart &body) { RewriteOmpAllocations(body); } - // Pre-visit all constructs that have both a specification part and // an execution part, and store the connection between the two. 
bool Pre(parser::BlockConstruct &x) { @@ -88,6 +86,7 @@ class CanonicalizationOfOmp { void Post(parser::SpecificationPart &spec) { CanonicalizeUtilityConstructs(spec); + CanonicalizeAllocateDirectives(spec); } void Post(parser::OmpMapClause &map) { CanonicalizeMapModifiers(map); } @@ -239,33 +238,138 @@ class CanonicalizationOfOmp { } } - void RewriteOmpAllocations(parser::ExecutionPart &body) { - // Rewrite leading declarative allocations so they are nested - // within their respective executable allocate directive - // - // Original: - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPExecutableAllocate - // - // After rewriting: - // ExecutionPartConstruct -> OpenMPExecutableAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - // ExecutionPartConstruct -> OpenMPDeclarativeAllocate - for (auto it = body.v.rbegin(); it != body.v.rend();) { - if (auto *exec = GetOmpIf<parser::OpenMPExecutableAllocate>(*(it++))) { - parser::OpenMPDeclarativeAllocate *decl; - std::list<parser::OpenMPDeclarativeAllocate> subAllocates; - while (it != body.v.rend() && - (decl = GetOmpIf<parser::OpenMPDeclarativeAllocate>(*it))) { - subAllocates.push_front(std::move(*decl)); - it = decltype(it)(body.v.erase(std::next(it).base())); + // Canonicalization of allocate directives + // + // In OpenMP 5.0 and 5.1 the allocate directive could either be a declarative + // one or an executable one. As usual in such cases, this poses a problem + // when the directive appears at the boundary between the specification part + // and the execution part. + // The executable form can actually consist of several adjacent directives, + // whereas the declarative form is always standalone. Additionally, the + // executable form must be associated with an allocate statement. + // + // The parser tries to parse declarative statements first, so in the + // following case, the two directives will be declarative, even though + // they should be treated as a single executable form: + // integer, allocatable :: x, y ! Specification + // !$omp allocate(x) + // !$omp allocate(y) + // allocate(x, y) ! Execution + // + void CanonicalizeAllocateDirectives(parser::SpecificationPart &spec) { + auto found = blockForSpec_.find(&spec); + if (found == blockForSpec_.end()) { + // There is no corresponding execution part, so there is nothing to do. + return; + } + parser::Block &block = *found->second; + + auto isAllocateStmt = [](const parser::ExecutionPartConstruct &epc) { + if (auto *ec = std::get_if<parser::ExecutableConstruct>(&epc.u)) { + if (auto *as = + std::get_if<parser::Statement<parser::ActionStmt>>(&ec->u)) { + return std::holds_alternative< + common::Indirection<parser::AllocateStmt>>(as->statement.u); + } + } + return false; + }; + + if (!block.empty() && isAllocateStmt(block.front())) { + // There are two places where an OpenMP declarative construct can + // show up in the tuple in specification part: + // (1) in std::list<OpenMPDeclarativeConstruct>, or + // (2) in std::list<DeclarationConstruct>. + // The case (1) is only possible if the list (2) is empty. 
+ + auto &omps = + std::get<std::list<parser::OpenMPDeclarativeConstruct>>(spec.t); + auto &decls = std::get<std::list<parser::DeclarationConstruct>>(spec.t); + + if (!decls.empty()) { + MakeExecutableAllocateFromDecls(decls, block); + } else { + MakeExecutableAllocateFromOmps(omps, block); + } + } + } + + parser::ExecutionPartConstruct EmbedInExec( + parser::OmpAllocateDirective *alo, parser::ExecutionPartConstruct &&epc) { + // Nest current epc inside the allocate directive. + std::get<parser::Block>(alo->t).push_front(std::move(epc)); + // Set the new epc to be the ExecutionPartConstruct made from + // the allocate directive. + parser::OpenMPConstruct opc(std::move(*alo)); + common::Indirection<parser::OpenMPConstruct> ind(std::move(opc)); + parser::ExecutableConstruct ec(std::move(ind)); + return parser::ExecutionPartConstruct(std::move(ec)); + } + + void MakeExecutableAllocateFromDecls( + std::list<parser::DeclarationConstruct> &decls, parser::Block &body) { + using OpenMPDeclarativeConstruct = + common::Indirection<parser::OpenMPDeclarativeConstruct>; + + auto getAllocate = [](parser::DeclarationConstruct *dc) { + if (auto *sc = std::get_if<parser::SpecificationConstruct>(&dc->u)) { + if (auto *odc = std::get_if<OpenMPDeclarativeConstruct>(&sc->u)) { + if (auto *alo = + std::get_if<parser::OmpAllocateDirective>(&odc->value().u)) { + return alo; + } + } + } + return static_cast<parser::OmpAllocateDirective *>(nullptr); + }; + + std::list<parser::DeclarationConstruct>::reverse_iterator rlast = [&]() { + for (auto rit = decls.rbegin(), rend = decls.rend(); rit != rend; ++rit) { + if (getAllocate(&*rit) == nullptr) { + return rit; } - if (!subAllocates.empty()) { - std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( - exec->t) = {std::move(subAllocates)}; + } + return decls.rend(); + }(); + + if (rlast != decls.rbegin()) { + // We have already checked that the first statement in body is + // ALLOCATE. + parser::ExecutionPartConstruct epc(std::move(body.front())); + for (auto rit = decls.rbegin(); rit != rlast; ++rit) { + epc = EmbedInExec(getAllocate(&*rit), std::move(epc)); + } + + body.pop_front(); + body.push_front(std::move(epc)); + decls.erase(rlast.base(), decls.end()); + } + } + + void MakeExecutableAllocateFromOmps( + std::list<parser::OpenMPDeclarativeConstruct> &omps, + parser::Block &body) { + using OpenMPDeclarativeConstruct = parser::OpenMPDeclarativeConstruct; + + std::list<OpenMPDeclarativeConstruct>::reverse_iterator rlast = [&]() { + for (auto rit = omps.rbegin(), rend = omps.rend(); rit != rend; ++rit) { + if (!std::holds_alternative<parser::OmpAllocateDirective>(rit->u)) { + return rit; } } + return omps.rend(); + }(); + + if (rlast != omps.rbegin()) { + parser::ExecutionPartConstruct epc(std::move(body.front())); + for (auto rit = omps.rbegin(); rit != rlast; ++rit) { + epc = EmbedInExec( + &std::get<parser::OmpAllocateDirective>(rit->u), std::move(epc)); + } + + body.pop_front(); + body.push_front(std::move(epc)); + omps.erase(rlast.base(), omps.end()); } } diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 3ea8e5b8cd2b0..d7db15dd37949 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -179,29 +179,21 @@ void OmpStructureChecker::Leave(const parser::BlockConstruct &x) { } } -// Use when clause falls under 'struct OmpClause' in 'parse-tree.h'. 
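To make the canonicalization above concrete, here is a standalone sketch of the reverse-scan plus fold that MakeExecutableAllocateFromDecls/EmbedInExec implement. It uses plain strings instead of parse-tree nodes, so the braces only suggest the nesting; the structure and iterator handling follow the patch.

#include <iostream>
#include <list>
#include <string>

int main() {
  // The specification part ends with two directives parsed as declarative.
  std::list<std::string> decls{"TypeDeclarationStmt",
                               "!$omp allocate(x)",
                               "!$omp allocate(y)"};
  std::string firstExecStmt{"allocate(x, y)"};

  auto isAllocateDir = [](const std::string &s) {
    return s.rfind("!$omp allocate", 0) == 0;
  };

  // Reverse-scan for the first entry that is not an allocate directive;
  // everything after it is the run to be re-nested (cf. rlast in the patch).
  auto rlast = decls.rbegin();
  while (rlast != decls.rend() && isAllocateDir(*rlast))
    ++rlast;

  // Fold: wrap the statement in the run, innermost-out (cf. EmbedInExec),
  // so the first directive in source order ends up outermost.
  std::string nested{firstExecStmt};
  for (auto rit = decls.rbegin(); rit != rlast; ++rit)
    nested = *rit + " { " + nested + " }";

  decls.erase(rlast.base(), decls.end()); // drop the migrated directives
  std::cout << nested << "\n";
  // Prints: !$omp allocate(x) { !$omp allocate(y) { allocate(x, y) } }
}

The printed shape matches the canonical form the parse-tree tests below now expect: each OmpAllocateDirective holds the next one in its Block, with the AllocateStmt at the deepest level.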
-#define CHECK_SIMPLE_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - } +void OmpStructureChecker::Enter(const parser::SpecificationPart &) { + partStack_.push_back(PartKind::SpecificationPart); +} -#define CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - RequiresConstantPositiveParameter(llvm::omp::Clause::Y, c.v); \ - } +void OmpStructureChecker::Leave(const parser::SpecificationPart &) { + partStack_.pop_back(); +} -#define CHECK_REQ_SCALAR_INT_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ - CheckAllowedClause(llvm::omp::Clause::Y); \ - RequiresPositiveParameter(llvm::omp::Clause::Y, c.v); \ - } +void OmpStructureChecker::Enter(const parser::ExecutionPart &) { + partStack_.push_back(PartKind::ExecutionPart); +} -// Use when clause don't falls under 'struct OmpClause' in 'parse-tree.h'. -#define CHECK_SIMPLE_PARSER_CLAUSE(X, Y) \ - void OmpStructureChecker::Enter(const parser::X &) { \ - CheckAllowedClause(llvm::omp::Y); \ - } +void OmpStructureChecker::Leave(const parser::ExecutionPart &) { + partStack_.pop_back(); +} // 'OmpWorkshareBlockChecker' is used to check the validity of the assignment // statements and the expressions enclosed in an OpenMP Workshare construct @@ -720,18 +712,10 @@ template <typename Checker> struct DirectiveSpellingVisitor { return std::get<parser::OmpBeginDirective>(t).DirName(); } - bool Pre(const parser::OpenMPDeclarativeAllocate &x) { - checker_(std::get<parser::Verbatim>(x.t).source, Directive::OMPD_allocate); - return false; - } bool Pre(const parser::OpenMPDispatchConstruct &x) { checker_(GetDirName(x.t).source, Directive::OMPD_dispatch); return false; } - bool Pre(const parser::OpenMPExecutableAllocate &x) { - checker_(std::get<parser::Verbatim>(x.t).source, Directive::OMPD_allocate); - return false; - } bool Pre(const parser::OpenMPAllocatorsConstruct &x) { checker_(GetDirName(x.t).source, Directive::OMPD_allocators); return false; @@ -1667,11 +1651,6 @@ void OmpStructureChecker::Leave(const parser::OpenMPRequiresConstruct &) { dirContext_.pop_back(); } -static std::pair<const parser::AllocateStmt *, parser::CharBlock> -getAllocateStmtAndSource(const parser::Statement<parser::AllocateStmt> &stmt) { - return {&stmt.statement, stmt.source}; -} - static std::pair<const parser::AllocateStmt *, parser::CharBlock> getAllocateStmtAndSource(const parser::ExecutionPartConstruct *epc) { if (SourcedActionStmt as{GetActionStmt(epc)}) { @@ -1699,19 +1678,12 @@ static UnorderedSymbolSet GetNonComponentSymbols( return symbols; } -static const parser::OmpObjectList &GetObjectsOrEmpty( - const std::optional<parser::OmpObjectList> &maybeObjects) { - static parser::OmpObjectList empty{std::list<parser::OmpObject>{}}; - if (maybeObjects) { - return *maybeObjects; - } - return empty; -} +void OmpStructureChecker::CheckIndividualAllocateDirective( + const parser::OmpAllocateDirective &x, bool isExecutable) { + const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; + const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; -void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, - const parser::OmpObjectList &objects, - const parser::OmpClauseList &clauses) { - const Scope &thisScope{context_.FindScope(source)}; + const Scope &thisScope{context_.FindScope(dirName.source)}; auto 
maybeHasPredefinedAllocator{[&](const parser::OmpClause *calloc) { // Return "true" if the ALLOCATOR clause was provided with an argument @@ -1740,7 +1712,7 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, const auto *allocator{[&]() { // Can't use FindClause in Enter (because clauses haven't been visited // yet). - for (const parser::OmpClause &c : clauses.v) { + for (const parser::OmpClause &c : beginSpec.Clauses().v) { if (c.Id() == llvm::omp::Clause::OMPC_allocator) { return &c; } @@ -1752,7 +1724,7 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, bool hasDynAllocators{ HasRequires(llvm::omp::Clause::OMPC_dynamic_allocators)}; if (!allocator && !hasDynAllocators) { - context_.Say(source, + context_.Say(dirName.source, "An ALLOCATE directive in a TARGET region must specify an ALLOCATOR clause or REQUIRES(DYNAMIC_ALLOCATORS) must be specified"_err_en_US); } } @@ -1766,7 +1738,7 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, : "a named common block or has SAVE attribute"}; auto checkSymbol{[&](const Symbol &symbol, parser::CharBlock source) { - if (!inExecutableAllocate_) { + if (!isExecutable) { // For structure members, the scope is the derived type, which is // never "this" scope. Ignore this check for members, they will be // flagged anyway. @@ -1802,37 +1774,130 @@ void OmpStructureChecker::CheckAllocateDirective(parser::CharBlock source, } }}; - for (const parser::OmpObject &object : objects.v) { - parser::CharBlock objSource{[&]() { - if (auto &&maybeSource{GetObjectSource(object)}) { - return *maybeSource; - } - return source; - }()}; - if (const Symbol *symbol{GetObjectSymbol(object)}) { + for (const parser::OmpArgument &arg : beginSpec.Arguments().v) { + const parser::OmpObject *object{GetArgumentObject(arg)}; + if (!object) { + context_.Say(arg.source, + "An argument to ALLOCATE directive must be a variable list item"_err_en_US); + continue; + } + + if (const Symbol *symbol{GetObjectSymbol(*object)}) { if (!IsTypeParamInquiry(*symbol)) { - checkSymbol(*symbol, objSource); + checkSymbol(*symbol, arg.source); } - CheckVarIsNotPartOfAnotherVar(source, object); + CheckVarIsNotPartOfAnotherVar(dirName.source, *object); } } } -void OmpStructureChecker::Enter(const parser::OpenMPDeclarativeAllocate &x) { - const auto &dir{std::get<parser::Verbatim>(x.t)}; - PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate); +void OmpStructureChecker::CheckExecutableAllocateDirective( + const parser::OmpAllocateDirective &x) { + parser::omp::OmpAllocateInfo info{SplitOmpAllocate(x)}; + + auto [allocStmt, allocSource]{getAllocateStmtAndSource(info.body)}; + if (!allocStmt) { + // This has been diagnosed already. 
+ return; + } - if (!inExecutableAllocate_) { - const auto &dir{std::get<parser::Verbatim>(x.t)}; - const auto &clauses{std::get<parser::OmpClauseList>(x.t)}; - const auto &objects{ - GetObjectsOrEmpty(std::get<std::optional<parser::OmpObjectList>>(x.t))}; + UnorderedSymbolSet allocateSyms{GetNonComponentSymbols(*allocStmt)}; + SymbolSourceMap directiveSyms; + bool hasEmptyList{false}; + + for (const parser::OmpAllocateDirective *ompAlloc : info.dirs) { + const parser::OmpDirectiveSpecification &spec{DEREF(ompAlloc).BeginDir()}; + if (spec.Arguments().v.empty()) { + if (hasEmptyList && info.dirs.size() > 1) { + context_.Say(spec.DirName().source, + "If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items"_err_en_US); + } + hasEmptyList = true; + } + for (const parser::OmpArgument &arg : spec.Arguments().v) { + if (auto *sym{GetArgumentSymbol(arg)}) { + // Ignore these checks for structure members. They are not allowed + // in the first place, so don't tell the users that they need to + // be specified somewhere, + if (IsStructureComponent(*sym)) { + continue; + } + if (auto f{directiveSyms.find(sym)}; f != directiveSyms.end()) { + parser::MessageFormattedText txt( + "A list item on an executable ALLOCATE may only be specified once"_err_en_US); + parser::Message message(arg.source, txt); + message.Attach(f->second, "The list item was specified here"_en_US); + context_.Say(std::move(message)); + } else { + directiveSyms.insert(std::make_pair(sym, arg.source)); + } - CheckAllocateDirective(dir.source, objects, clauses); + if (auto f{allocateSyms.find(*sym)}; f == allocateSyms.end()) { + context_ + .Say(arg.source, + "A list item on an executable ALLOCATE must be specified on the associated ALLOCATE statement"_err_en_US) + .Attach(allocSource, "The ALLOCATE statement"_en_US); + } + } + } } } -void OmpStructureChecker::Leave(const parser::OpenMPDeclarativeAllocate &x) { +void OmpStructureChecker::Enter(const parser::OmpAllocateDirective &x) { + const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; + const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; + PushContextAndClauseSets(dirName.source, dirName.v); + ++allocateDirectiveLevel; + + bool isExecutable{partStack_.back() == PartKind::ExecutionPart}; + + unsigned version{context_.langOptions().OpenMPVersion}; + if (isExecutable && allocateDirectiveLevel == 1 && version >= 52) { + context_.Warn(common::UsageWarning::OpenMPUsage, dirName.source, + "The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead"_warn_en_US); + } + + CheckIndividualAllocateDirective(x, isExecutable); + + if (isExecutable) { + auto isOmpAllocate{[](const parser::ExecutionPartConstruct &epc) { + if (auto *omp{GetOmp(epc)}) { + auto odn{GetOmpDirectiveName(*omp)}; + return odn.v == llvm::omp::Directive::OMPD_allocate; + } + return false; + }}; + + auto &body{std::get<parser::Block>(x.t)}; + // The parser should put at most one statement in the body. 
+ assert(body.size() <= 1 && "Multiple statements in allocate"); + if (body.empty()) { + context_.Say(dirName.source, + "An executable ALLOCATE directive must be associated with an ALLOCATE statement"_err_en_US); + } else { + const parser::ExecutionPartConstruct &first{body.front()}; + auto [allocStmt, _]{getAllocateStmtAndSource(&body.front())}; + if (!isOmpAllocate(first) && !allocStmt) { + parser::CharBlock source{[&]() { + if (auto &&maybeSource{parser::GetSource(first)}) { + return *maybeSource; + } + return dirName.source; + }()}; + context_.Say(source, + "The statement associated with executable ALLOCATE directive must be an ALLOCATE statement"_err_en_US); + } + } + } +} + +void OmpStructureChecker::Leave(const parser::OmpAllocateDirective &x) { + bool isExecutable{partStack_.back() == PartKind::ExecutionPart}; + if (isExecutable && allocateDirectiveLevel == 1) { + CheckExecutableAllocateDirective(x); + } + + --allocateDirectiveLevel; dirContext_.pop_back(); } @@ -2135,112 +2200,6 @@ void OmpStructureChecker::Enter(const parser::OmpClause::At &x) { } } -void OmpStructureChecker::Enter(const parser::OpenMPExecutableAllocate &x) { - inExecutableAllocate_ = true; - const auto &dir{std::get<parser::Verbatim>(x.t)}; - PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_allocate); - - unsigned version{context_.langOptions().OpenMPVersion}; - if (version >= 52) { - context_.Warn(common::UsageWarning::OpenMPUsage, x.source, - "The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead"_warn_en_US); - } - - auto &objects{ - GetObjectsOrEmpty(std::get<std::optional<parser::OmpObjectList>>(x.t))}; - auto &clauses{std::get<parser::OmpClauseList>(x.t)}; - - CheckAllocateDirective( - std::get<parser::Verbatim>(x.t).source, objects, clauses); - - if (const auto &subDirs{ - std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( - x.t)}) { - for (const auto &dalloc : *subDirs) { - const auto &dir{std::get<parser::Verbatim>(x.t)}; - const auto &clauses{std::get<parser::OmpClauseList>(dalloc.t)}; - const auto &objects{GetObjectsOrEmpty( - std::get<std::optional<parser::OmpObjectList>>(dalloc.t))}; - CheckAllocateDirective(dir.source, objects, clauses); - } - } -} - -void OmpStructureChecker::Leave(const parser::OpenMPExecutableAllocate &x) { - auto [allocStmt, allocSource]{getAllocateStmtAndSource( - std::get<parser::Statement<parser::AllocateStmt>>(x.t))}; - - UnorderedSymbolSet allocateSyms{GetNonComponentSymbols(*allocStmt)}; - SymbolSourceMap directiveSyms; - auto &objects{ - GetObjectsOrEmpty(std::get<std::optional<parser::OmpObjectList>>(x.t))}; - auto emptyListCount{static_cast<size_t>(objects.v.empty())}; - auto checkObjects{[&](const parser::OmpObjectList &objects, - parser::CharBlock dirSource, - parser::CharBlock allocSource) { - for (const parser::OmpObject &object : objects.v) { - parser::CharBlock objSource{[&]() { - if (auto &&maybeSource{GetObjectSource(object)}) { - return *maybeSource; - } - return dirSource; - }()}; - if (auto *sym{GetObjectSymbol(object)}) { - // Ignore these checks for structure members. 
They are not allowed - // in the first place, so don't tell the users that they nened to - // be specified somewhere, - if (IsStructureComponent(*sym)) { - continue; - } - if (auto f{directiveSyms.find(sym)}; f != directiveSyms.end()) { - parser::MessageFormattedText txt( - "A list item on an executable ALLOCATE may only be specified once"_err_en_US); - parser::Message message(objSource, txt); - message.Attach(f->second, "The list item was specified here"_en_US); - context_.Say(std::move(message)); - } else { - directiveSyms.insert(std::make_pair(sym, objSource)); - } - - if (auto f{allocateSyms.find(*sym)}; f == allocateSyms.end()) { - context_ - .Say(objSource, - "A list item on an executable ALLOCATE must be specified on the associated ALLOCATE statement"_err_en_US) - .Attach(allocSource, "The ALLOCATE statement"_en_US); - } - } - } - }}; - - checkObjects(objects, std::get<parser::Verbatim>(x.t).source, allocSource); - - const auto &subDirs{ - std::get<std::optional<std::list<parser::OpenMPDeclarativeAllocate>>>( - x.t)}; - if (!subDirs) { - inExecutableAllocate_ = false; - dirContext_.pop_back(); - return; - } - - for (const parser::OpenMPDeclarativeAllocate &ompAlloc : *subDirs) { - parser::CharBlock dirSource{std::get<parser::Verbatim>(ompAlloc.t).source}; - auto &objects{GetObjectsOrEmpty( - std::get<std::optional<parser::OmpObjectList>>(ompAlloc.t))}; - if (objects.v.empty()) { - // Only show the message once per construct. - if (++emptyListCount == 2 && subDirs->size() >= 1) { - context_.Say(dirSource, - "If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items"_err_en_US); - } - } - checkObjects(objects, dirSource, allocSource); - } - - inExecutableAllocate_ = false; - dirContext_.pop_back(); -} - void OmpStructureChecker::Enter(const parser::OpenMPAllocatorsConstruct &x) { const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()}; const parser::OmpDirectiveName &dirName{beginSpec.DirName()}; @@ -3408,88 +3367,6 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Sizes &c) { /*paramName=*/"parameter", /*allowZero=*/false); } -// Following clauses do not have a separate node in parse-tree.h. 
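The list-item rules enforced by CheckExecutableAllocateDirective above can be summarized with a small standalone sketch; the container choices and item names here are illustrative only, not the checker's real symbol machinery.

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  // Items named on the associated ALLOCATE statement.
  std::set<std::string> allocateStmtItems{"x", "y"};
  // Items named on each directive of the executable ALLOCATE construct.
  std::vector<std::vector<std::string>> directiveItems{{"x"}, {"y", "x"}};

  std::map<std::string, int> firstSeen; // item -> index of first directive
  for (int d = 0; d < static_cast<int>(directiveItems.size()); ++d) {
    for (const std::string &item : directiveItems[d]) {
      auto [it, inserted] = firstSeen.emplace(item, d);
      if (!inserted) // rule 1: at most one directive may list an item
        std::cout << "error: '" << item << "' already listed on directive "
                  << it->second << "\n";
      if (!allocateStmtItems.count(item)) // rule 2: must be allocated
        std::cout << "error: '" << item
                  << "' is not allocated by the ALLOCATE statement\n";
    }
  }
  // Prints one duplicate-item error for 'x'.
}

The checker additionally enforces that at most one directive in the stack may have an empty list, which the sketch leaves out for brevity.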
-CHECK_SIMPLE_CLAUSE(Absent, OMPC_absent) -CHECK_SIMPLE_CLAUSE(Affinity, OMPC_affinity) -CHECK_SIMPLE_CLAUSE(Capture, OMPC_capture) -CHECK_SIMPLE_CLAUSE(Contains, OMPC_contains) -CHECK_SIMPLE_CLAUSE(Default, OMPC_default) -CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj) -CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type) -CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule) -CHECK_SIMPLE_CLAUSE(DynGroupprivate, OMPC_dyn_groupprivate) -CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive) -CHECK_SIMPLE_CLAUSE(Final, OMPC_final) -CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush) -CHECK_SIMPLE_CLAUSE(Full, OMPC_full) -CHECK_SIMPLE_CLAUSE(Grainsize, OMPC_grainsize) -CHECK_SIMPLE_CLAUSE(GraphId, OMPC_graph_id) -CHECK_SIMPLE_CLAUSE(GraphReset, OMPC_graph_reset) -CHECK_SIMPLE_CLAUSE(Holds, OMPC_holds) -CHECK_SIMPLE_CLAUSE(Inclusive, OMPC_inclusive) -CHECK_SIMPLE_CLAUSE(Initializer, OMPC_initializer) -CHECK_SIMPLE_CLAUSE(Match, OMPC_match) -CHECK_SIMPLE_CLAUSE(Nontemporal, OMPC_nontemporal) -CHECK_SIMPLE_CLAUSE(NumTasks, OMPC_num_tasks) -CHECK_SIMPLE_CLAUSE(Order, OMPC_order) -CHECK_SIMPLE_CLAUSE(Read, OMPC_read) -CHECK_SIMPLE_CLAUSE(Threadprivate, OMPC_threadprivate) -CHECK_SIMPLE_CLAUSE(Groupprivate, OMPC_groupprivate) -CHECK_SIMPLE_CLAUSE(Threads, OMPC_threads) -CHECK_SIMPLE_CLAUSE(Threadset, OMPC_threadset) -CHECK_SIMPLE_CLAUSE(Inbranch, OMPC_inbranch) -CHECK_SIMPLE_CLAUSE(Link, OMPC_link) -CHECK_SIMPLE_CLAUSE(Indirect, OMPC_indirect) -CHECK_SIMPLE_CLAUSE(Mergeable, OMPC_mergeable) -CHECK_SIMPLE_CLAUSE(NoOpenmp, OMPC_no_openmp) -CHECK_SIMPLE_CLAUSE(NoOpenmpRoutines, OMPC_no_openmp_routines) -CHECK_SIMPLE_CLAUSE(NoOpenmpConstructs, OMPC_no_openmp_constructs) -CHECK_SIMPLE_CLAUSE(NoParallelism, OMPC_no_parallelism) -CHECK_SIMPLE_CLAUSE(Nogroup, OMPC_nogroup) -CHECK_SIMPLE_CLAUSE(Notinbranch, OMPC_notinbranch) -CHECK_SIMPLE_CLAUSE(Partial, OMPC_partial) -CHECK_SIMPLE_CLAUSE(ProcBind, OMPC_proc_bind) -CHECK_SIMPLE_CLAUSE(Simd, OMPC_simd) -CHECK_SIMPLE_CLAUSE(Permutation, OMPC_permutation) -CHECK_SIMPLE_CLAUSE(Uniform, OMPC_uniform) -CHECK_SIMPLE_CLAUSE(Unknown, OMPC_unknown) -CHECK_SIMPLE_CLAUSE(Untied, OMPC_untied) -CHECK_SIMPLE_CLAUSE(UsesAllocators, OMPC_uses_allocators) -CHECK_SIMPLE_CLAUSE(Write, OMPC_write) -CHECK_SIMPLE_CLAUSE(Init, OMPC_init) -CHECK_SIMPLE_CLAUSE(Use, OMPC_use) -CHECK_SIMPLE_CLAUSE(Novariants, OMPC_novariants) -CHECK_SIMPLE_CLAUSE(Nocontext, OMPC_nocontext) -CHECK_SIMPLE_CLAUSE(Severity, OMPC_severity) -CHECK_SIMPLE_CLAUSE(Message, OMPC_message) -CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter) -CHECK_SIMPLE_CLAUSE(Otherwise, OMPC_otherwise) -CHECK_SIMPLE_CLAUSE(AdjustArgs, OMPC_adjust_args) -CHECK_SIMPLE_CLAUSE(AppendArgs, OMPC_append_args) -CHECK_SIMPLE_CLAUSE(MemoryOrder, OMPC_memory_order) -CHECK_SIMPLE_CLAUSE(Bind, OMPC_bind) -CHECK_SIMPLE_CLAUSE(Compare, OMPC_compare) -CHECK_SIMPLE_CLAUSE(OmpxAttribute, OMPC_ompx_attribute) -CHECK_SIMPLE_CLAUSE(Weak, OMPC_weak) -CHECK_SIMPLE_CLAUSE(AcqRel, OMPC_acq_rel) -CHECK_SIMPLE_CLAUSE(Acquire, OMPC_acquire) -CHECK_SIMPLE_CLAUSE(Relaxed, OMPC_relaxed) -CHECK_SIMPLE_CLAUSE(Release, OMPC_release) -CHECK_SIMPLE_CLAUSE(Replayable, OMPC_replayable) -CHECK_SIMPLE_CLAUSE(Transparent, OMPC_transparent) -CHECK_SIMPLE_CLAUSE(SeqCst, OMPC_seq_cst) -CHECK_SIMPLE_CLAUSE(Fail, OMPC_fail) - -CHECK_REQ_SCALAR_INT_CLAUSE(NumTeams, OMPC_num_teams) -CHECK_REQ_SCALAR_INT_CLAUSE(NumThreads, OMPC_num_threads) -CHECK_REQ_SCALAR_INT_CLAUSE(OmpxDynCgroupMem, OMPC_ompx_dyn_cgroup_mem) -CHECK_REQ_SCALAR_INT_CLAUSE(Priority, OMPC_priority) -CHECK_REQ_SCALAR_INT_CLAUSE(ThreadLimit, 
OMPC_thread_limit) - -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Collapse, OMPC_collapse) -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Safelen, OMPC_safelen) -CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Simdlen, OMPC_simdlen) - void OmpStructureChecker::Enter(const parser::OmpClause::Looprange &x) { context_.Say(GetContext().clauseSource, "LOOPRANGE clause is not implemented yet"_err_en_US, @@ -5562,4 +5439,104 @@ void OmpStructureChecker::CheckAllowedRequiresClause(llvmOmpClause clause) { } } +// Use when clause falls under 'struct OmpClause' in 'parse-tree.h'. +#define CHECK_SIMPLE_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + } + +#define CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + RequiresConstantPositiveParameter(llvm::omp::Clause::Y, c.v); \ + } + +#define CHECK_REQ_SCALAR_INT_CLAUSE(X, Y) \ + void OmpStructureChecker::Enter(const parser::OmpClause::X &c) { \ + CheckAllowedClause(llvm::omp::Clause::Y); \ + RequiresPositiveParameter(llvm::omp::Clause::Y, c.v); \ + } + +// Following clauses do not have a separate node in parse-tree.h. +CHECK_SIMPLE_CLAUSE(Absent, OMPC_absent) +CHECK_SIMPLE_CLAUSE(AcqRel, OMPC_acq_rel) +CHECK_SIMPLE_CLAUSE(Acquire, OMPC_acquire) +CHECK_SIMPLE_CLAUSE(AdjustArgs, OMPC_adjust_args) +CHECK_SIMPLE_CLAUSE(Affinity, OMPC_affinity) +CHECK_SIMPLE_CLAUSE(AppendArgs, OMPC_append_args) +CHECK_SIMPLE_CLAUSE(Bind, OMPC_bind) +CHECK_SIMPLE_CLAUSE(Capture, OMPC_capture) +CHECK_SIMPLE_CLAUSE(Compare, OMPC_compare) +CHECK_SIMPLE_CLAUSE(Contains, OMPC_contains) +CHECK_SIMPLE_CLAUSE(Default, OMPC_default) +CHECK_SIMPLE_CLAUSE(Depobj, OMPC_depobj) +CHECK_SIMPLE_CLAUSE(DeviceType, OMPC_device_type) +CHECK_SIMPLE_CLAUSE(DistSchedule, OMPC_dist_schedule) +CHECK_SIMPLE_CLAUSE(DynGroupprivate, OMPC_dyn_groupprivate) +CHECK_SIMPLE_CLAUSE(Exclusive, OMPC_exclusive) +CHECK_SIMPLE_CLAUSE(Fail, OMPC_fail) +CHECK_SIMPLE_CLAUSE(Filter, OMPC_filter) +CHECK_SIMPLE_CLAUSE(Final, OMPC_final) +CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush) +CHECK_SIMPLE_CLAUSE(Full, OMPC_full) +CHECK_SIMPLE_CLAUSE(Grainsize, OMPC_grainsize) +CHECK_SIMPLE_CLAUSE(GraphId, OMPC_graph_id) +CHECK_SIMPLE_CLAUSE(GraphReset, OMPC_graph_reset) +CHECK_SIMPLE_CLAUSE(Groupprivate, OMPC_groupprivate) +CHECK_SIMPLE_CLAUSE(Holds, OMPC_holds) +CHECK_SIMPLE_CLAUSE(Inbranch, OMPC_inbranch) +CHECK_SIMPLE_CLAUSE(Inclusive, OMPC_inclusive) +CHECK_SIMPLE_CLAUSE(Indirect, OMPC_indirect) +CHECK_SIMPLE_CLAUSE(Initializer, OMPC_initializer) +CHECK_SIMPLE_CLAUSE(Init, OMPC_init) +CHECK_SIMPLE_CLAUSE(Link, OMPC_link) +CHECK_SIMPLE_CLAUSE(Match, OMPC_match) +CHECK_SIMPLE_CLAUSE(MemoryOrder, OMPC_memory_order) +CHECK_SIMPLE_CLAUSE(Mergeable, OMPC_mergeable) +CHECK_SIMPLE_CLAUSE(Message, OMPC_message) +CHECK_SIMPLE_CLAUSE(Nocontext, OMPC_nocontext) +CHECK_SIMPLE_CLAUSE(Nogroup, OMPC_nogroup) +CHECK_SIMPLE_CLAUSE(Nontemporal, OMPC_nontemporal) +CHECK_SIMPLE_CLAUSE(NoOpenmpConstructs, OMPC_no_openmp_constructs) +CHECK_SIMPLE_CLAUSE(NoOpenmp, OMPC_no_openmp) +CHECK_SIMPLE_CLAUSE(NoOpenmpRoutines, OMPC_no_openmp_routines) +CHECK_SIMPLE_CLAUSE(NoParallelism, OMPC_no_parallelism) +CHECK_SIMPLE_CLAUSE(Notinbranch, OMPC_notinbranch) +CHECK_SIMPLE_CLAUSE(Novariants, OMPC_novariants) +CHECK_SIMPLE_CLAUSE(NumTasks, OMPC_num_tasks) +CHECK_SIMPLE_CLAUSE(OmpxAttribute, OMPC_ompx_attribute) +CHECK_SIMPLE_CLAUSE(Order, OMPC_order) +CHECK_SIMPLE_CLAUSE(Otherwise, OMPC_otherwise) 
+CHECK_SIMPLE_CLAUSE(Partial, OMPC_partial) +CHECK_SIMPLE_CLAUSE(Permutation, OMPC_permutation) +CHECK_SIMPLE_CLAUSE(ProcBind, OMPC_proc_bind) +CHECK_SIMPLE_CLAUSE(Read, OMPC_read) +CHECK_SIMPLE_CLAUSE(Relaxed, OMPC_relaxed) +CHECK_SIMPLE_CLAUSE(Release, OMPC_release) +CHECK_SIMPLE_CLAUSE(Replayable, OMPC_replayable) +CHECK_SIMPLE_CLAUSE(SeqCst, OMPC_seq_cst) +CHECK_SIMPLE_CLAUSE(Severity, OMPC_severity) +CHECK_SIMPLE_CLAUSE(Simd, OMPC_simd) +CHECK_SIMPLE_CLAUSE(Threadprivate, OMPC_threadprivate) +CHECK_SIMPLE_CLAUSE(Threadset, OMPC_threadset) +CHECK_SIMPLE_CLAUSE(Threads, OMPC_threads) +CHECK_SIMPLE_CLAUSE(Transparent, OMPC_transparent) +CHECK_SIMPLE_CLAUSE(Uniform, OMPC_uniform) +CHECK_SIMPLE_CLAUSE(Unknown, OMPC_unknown) +CHECK_SIMPLE_CLAUSE(Untied, OMPC_untied) +CHECK_SIMPLE_CLAUSE(Use, OMPC_use) +CHECK_SIMPLE_CLAUSE(UsesAllocators, OMPC_uses_allocators) +CHECK_SIMPLE_CLAUSE(Weak, OMPC_weak) +CHECK_SIMPLE_CLAUSE(Write, OMPC_write) + +CHECK_REQ_SCALAR_INT_CLAUSE(NumTeams, OMPC_num_teams) +CHECK_REQ_SCALAR_INT_CLAUSE(NumThreads, OMPC_num_threads) +CHECK_REQ_SCALAR_INT_CLAUSE(OmpxDynCgroupMem, OMPC_ompx_dyn_cgroup_mem) +CHECK_REQ_SCALAR_INT_CLAUSE(Priority, OMPC_priority) +CHECK_REQ_SCALAR_INT_CLAUSE(ThreadLimit, OMPC_thread_limit) + +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Collapse, OMPC_collapse) +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Safelen, OMPC_safelen) +CHECK_REQ_CONSTANT_SCALAR_INT_CLAUSE(Simdlen, OMPC_simdlen) + } // namespace Fortran::semantics diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 6feb1d149c4fd..1b84bc5dda471 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -82,6 +82,11 @@ class OmpStructureChecker : public OmpStructureCheckerBase { bool Enter(const parser::BlockConstruct &); void Leave(const parser::BlockConstruct &); + void Enter(const parser::SpecificationPart &); + void Leave(const parser::SpecificationPart &); + void Enter(const parser::ExecutionPart &); + void Leave(const parser::ExecutionPart &); + void Enter(const parser::OpenMPConstruct &); void Leave(const parser::OpenMPConstruct &); void Enter(const parser::OpenMPInteropConstruct &); @@ -113,8 +118,8 @@ class OmpStructureChecker : public OmpStructureCheckerBase { void Leave(const parser::OmpDeclareVariantDirective &); void Enter(const parser::OpenMPDeclareSimdConstruct &); void Leave(const parser::OpenMPDeclareSimdConstruct &); - void Enter(const parser::OpenMPDeclarativeAllocate &); - void Leave(const parser::OpenMPDeclarativeAllocate &); + void Enter(const parser::OmpAllocateDirective &); + void Leave(const parser::OmpAllocateDirective &); void Enter(const parser::OpenMPDeclareMapperConstruct &); void Leave(const parser::OpenMPDeclareMapperConstruct &); void Enter(const parser::OpenMPDeclareReductionConstruct &); @@ -129,8 +134,6 @@ class OmpStructureChecker : public OmpStructureCheckerBase { void Leave(const parser::OmpErrorDirective &); void Enter(const parser::OmpNothingDirective &); void Leave(const parser::OmpNothingDirective &); - void Enter(const parser::OpenMPExecutableAllocate &); - void Leave(const parser::OpenMPExecutableAllocate &); void Enter(const parser::OpenMPAllocatorsConstruct &); void Leave(const parser::OpenMPAllocatorsConstruct &); void Enter(const parser::OpenMPRequiresConstruct &); @@ -263,9 +266,9 @@ class OmpStructureChecker : public OmpStructureCheckerBase { bool CheckTargetBlockOnlyTeams(const parser::Block &); void CheckWorkshareBlockStmts(const parser::Block &, 
parser::CharBlock); void CheckWorkdistributeBlockStmts(const parser::Block &, parser::CharBlock); - void CheckAllocateDirective(parser::CharBlock source, - const parser::OmpObjectList &objects, - const parser::OmpClauseList &clauses); + void CheckIndividualAllocateDirective( + const parser::OmpAllocateDirective &x, bool isExecutable); + void CheckExecutableAllocateDirective(const parser::OmpAllocateDirective &x); void CheckIteratorRange(const parser::OmpIteratorSpecifier &x); void CheckIteratorModifier(const parser::OmpIterator &x); @@ -373,7 +376,7 @@ class OmpStructureChecker : public OmpStructureCheckerBase { }; int directiveNest_[LastType + 1] = {0}; - bool inExecutableAllocate_{false}; + int allocateDirectiveLevel{0}; parser::CharBlock visitedAtomicSource_; SymbolSourceMap deferredNonVariables_; @@ -382,6 +385,14 @@ class OmpStructureChecker : public OmpStructureCheckerBase { std::vector<LoopConstruct> loopStack_; // Scopes for scoping units. std::vector<const Scope *> scopeStack_; + + enum class PartKind : int { + // There are also other "parts", such as internal-subprogram-part, etc, + // but we're keeping track of these two for now. + SpecificationPart, + ExecutionPart, + }; + std::vector<PartKind> partStack_; }; /// Find a duplicate entry in the range, and return an iterator to it. diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 03c8cb0065fd8..deb57e005a352 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -415,6 +415,18 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> { return true; } + bool Pre(const parser::SpecificationPart &) { + partStack_.push_back(PartKind::SpecificationPart); + return true; + } + void Post(const parser::SpecificationPart &) { partStack_.pop_back(); } + + bool Pre(const parser::ExecutionPart &) { + partStack_.push_back(PartKind::ExecutionPart); + return true; + } + void Post(const parser::ExecutionPart &) { partStack_.pop_back(); } + bool Pre(const parser::InternalSubprogram &) { // Clear the labels being tracked in the previous scope ClearLabels(); @@ -639,8 +651,7 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> { bool Pre(const parser::OpenMPThreadprivate &); void Post(const parser::OpenMPThreadprivate &) { PopContext(); } - bool Pre(const parser::OpenMPDeclarativeAllocate &); - void Post(const parser::OpenMPDeclarativeAllocate &) { PopContext(); } + bool Pre(const parser::OmpAllocateDirective &); bool Pre(const parser::OpenMPAssumeConstruct &); void Post(const parser::OpenMPAssumeConstruct &) { PopContext(); } @@ -651,9 +662,6 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> { bool Pre(const parser::OpenMPDispatchConstruct &); void Post(const parser::OpenMPDispatchConstruct &) { PopContext(); } - bool Pre(const parser::OpenMPExecutableAllocate &); - void Post(const parser::OpenMPExecutableAllocate &); - bool Pre(const parser::OpenMPAllocatorsConstruct &); void Post(const parser::OpenMPAllocatorsConstruct &); @@ -998,6 +1006,14 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> { targetLabels_; parser::CharBlock currentStatementSource_; + enum class PartKind : int { + // There are also other "parts", such as internal-subprogram-part, etc, + // but we're keeping track of these two for now. 
+ SpecificationPart, + ExecutionPart, + }; + std::vector<PartKind> partStack_; + void AddAllocateName(const parser::Name *&object) { allocateNames_.push_back(object); } @@ -2558,11 +2574,24 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPThreadprivate &x) { return true; } -bool OmpAttributeVisitor::Pre(const parser::OpenMPDeclarativeAllocate &x) { +bool OmpAttributeVisitor::Pre(const parser::OmpAllocateDirective &x) { PushContext(x.source, llvm::omp::Directive::OMPD_allocate); - if (const auto &list{std::get<std::optional<parser::OmpObjectList>>(x.t)}) { - ResolveOmpObjectList(*list, Symbol::Flag::OmpDeclarativeAllocateDirective); + assert(!partStack_.empty() && "Misplaced directive"); + + auto ompFlag{partStack_.back() == PartKind::SpecificationPart + ? Symbol::Flag::OmpDeclarativeAllocateDirective + : Symbol::Flag::OmpExecutableAllocateDirective}; + + parser::omp::OmpAllocateInfo info{parser::omp::SplitOmpAllocate(x)}; + for (const parser::OmpAllocateDirective *ad : info.dirs) { + for (const parser::OmpArgument &arg : ad->BeginDir().Arguments().v) { + if (auto *object{omp::GetArgumentObject(arg)}) { + ResolveOmpObject(*object, ompFlag); + } + } } + + PopContext(); return false; } @@ -2581,15 +2610,6 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPDispatchConstruct &x) { return true; } -bool OmpAttributeVisitor::Pre(const parser::OpenMPExecutableAllocate &x) { - PushContext(x.source, llvm::omp::Directive::OMPD_allocate); - const auto &list{std::get<std::optional<parser::OmpObjectList>>(x.t)}; - if (list) { - ResolveOmpObjectList(*list, Symbol::Flag::OmpExecutableAllocateDirective); - } - return true; -} - bool OmpAttributeVisitor::Pre(const parser::OpenMPAllocatorsConstruct &x) { const parser::OmpDirectiveSpecification &dirSpec{x.BeginDir()}; PushContext(x.source, dirSpec.DirId()); @@ -2661,10 +2681,6 @@ bool OmpAttributeVisitor::IsNestedInDirective(llvm::omp::Directive directive) { return false; } -void OmpAttributeVisitor::Post(const parser::OpenMPExecutableAllocate &x) { - PopContext(); -} - void OmpAttributeVisitor::Post(const parser::OpenMPAllocatorsConstruct &x) { PopContext(); } diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 220f1c96b9823..a2062ef28d52c 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1700,12 +1700,12 @@ class OmpVisitor : public virtual DeclarationVisitor { void Post(const parser::OpenMPDeclareTargetConstruct &) { SkipImplicitTyping(false); } - bool Pre(const parser::OpenMPDeclarativeAllocate &x) { + bool Pre(const parser::OmpAllocateDirective &x) { AddOmpSourceRange(x.source); SkipImplicitTyping(true); return true; } - void Post(const parser::OpenMPDeclarativeAllocate &) { + void Post(const parser::OmpAllocateDirective &) { SkipImplicitTyping(false); messageHandler().set_currStmtSource(std::nullopt); } diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90 index 59af58ddcd32e..27097193aaa9b 100644 --- a/flang/module/cudadevice.f90 +++ b/flang/module/cudadevice.f90 @@ -1171,6 +1171,45 @@ attributes(device) pure integer(8) function atomicaddl(address, val) integer(8), intent(inout) :: address integer(8), value :: val end function + attributes(device) pure integer(4) function atomicaddr2(address, val) + !dir$ ignore_tkr (rd) address, (d) val + real(2), dimension(2), intent(inout) :: address + real(2), dimension(2), intent(in) :: val + end function + end interface + + interface atomicaddvector + attributes(device) pure function 
atomicaddvector_r2x2(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(2), dimension(2), intent(inout) :: address + real(2), dimension(2), intent(in) :: val + real(2), dimension(2) :: z + end function + + attributes(device) pure function atomicaddvector_r4x2(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(2), intent(inout) :: address + real(4), dimension(2), intent(in) :: val + real(4), dimension(2) :: z + end function + end interface + + interface atomicaddreal4x2 + attributes(device) pure function atomicadd_r4x2(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(2), intent(inout) :: address + real(4), dimension(2), intent(in) :: val + real(4), dimension(2) :: z + end function + end interface + + interface atomicaddreal4x4 + attributes(device) pure function atomicadd_r4x4(address, val) result(z) + !dir$ ignore_tkr (rd) address, (d) val + real(4), dimension(4), intent(inout) :: address + real(4), dimension(4), intent(in) :: val + real(4), dimension(4) :: z + end function end interface interface atomicsub diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtbegin.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtend.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crti.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crti.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtn.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13/crtn.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtbegin.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtbegin.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtend.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtend.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crti.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crti.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtn.o b/flang/test/Driver/Inputs/fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13/crtn.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/flang/test/Driver/gcc-triple.f90 b/flang/test/Driver/gcc-triple.f90 new file mode 100644 index 0000000000000..324311febf157 --- /dev/null +++ b/flang/test/Driver/gcc-triple.f90 @@ -0,0 +1,18 @@ +!! UNSUPPORTED: system-windows + +!! Test that --gcc-triple option is working as expected. + +! RUN: %flang --target=x86_64-linux-gnu -v --sysroot=%S/Inputs/fedora_39_tree 2>&1 | FileCheck %s --dump-input=always --check-prefix=DEFAULT_TRIPLE +! DEFAULT_TRIPLE: {{^}}Found candidate GCC installation: +! 
DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13 +! DEFAULT_TRIPLE: {{^}}Found candidate GCC installation: +! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13 +! DEFAULT_TRIPLE: {{^}}Selected GCC installation: +! DEFAULT_TRIPLE: fedora_39_tree/usr/lib/gcc/x86_64-linux-gnu/13 + +! RUN: %flang -v --sysroot=%S/Inputs/fedora_39_tree --gcc-triple=x86_64-redhat-linux 2>&1 | FileCheck %s --check-prefix=TRIPLE_EXISTS +! TRIPLE_EXISTS: {{^}}Selected GCC installation: +! TRIPLE_EXISTS: fedora_39_tree/usr/lib/gcc/x86_64-redhat-linux/13 + +! RUN: %flang -v --sysroot=%S/Inputs/fedora_39_tree --gcc-triple=x86_64-foo-linux 2>&1 | FileCheck %s --check-prefix=TRIPLE_DOES_NOT_EXISTS +! TRIPLE_DOES_NOT_EXISTS-NOT: x86_64-foo-linux \ No newline at end of file diff --git a/flang/test/Lower/CUDA/cuda-atomicadd.cuf b/flang/test/Lower/CUDA/cuda-atomicadd.cuf new file mode 100644 index 0000000000000..6669b4afa291d --- /dev/null +++ b/flang/test/Lower/CUDA/cuda-atomicadd.cuf @@ -0,0 +1,35 @@ +! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s + +! Test CUDA Fortran atmoicadd functions available cudadevice module + +attributes(global) subroutine test_atomicaddvector_r2() + real(2), device :: a(2), tmp1(2), tmp2(2) + tmp1 = atomicAddVector(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicaddvector_r2() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf16> + +attributes(global) subroutine test_atomicaddvector_r4() + real(4), device :: a(2), tmp1(2), tmp2(2) + tmp1 = atomicAddVector(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicaddvector_r4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf32> + +attributes(global) subroutine test_atomicadd_r2x4() + real(4), device :: a(2), tmp1(2), tmp2(2) + tmp1 = atomicaddreal4x2(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicadd_r2x4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf32> + +attributes(global) subroutine test_atomicadd_r4x4() + real(4), device :: a(4), tmp1(4), tmp2(4) + tmp1 = atomicaddreal4x4(a, tmp2) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_atomicadd_r4x4() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} +! CHECK: llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<4xf32> diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf index 09b4302446ee7..2d2c801b48f4d 100644 --- a/flang/test/Lower/CUDA/cuda-device-proc.cuf +++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf @@ -14,6 +14,8 @@ attributes(global) subroutine devsub() integer :: smalltime integer(4) :: res, offset integer(8) :: resl + real(2) :: r2a(2) + real(2) :: tmp2(2) integer :: tid tid = threadIdx%x @@ -34,6 +36,7 @@ attributes(global) subroutine devsub() al = atomicadd(al, 1_8) af = atomicadd(af, 1.0_4) ad = atomicadd(ad, 1.0_8) + ai = atomicadd(r2a, tmp2) ai = atomicsub(ai, 1_4) al = atomicsub(al, 1_8) @@ -128,6 +131,7 @@ end ! CHECK: %{{.*}} = llvm.atomicrmw add %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 ! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f32 ! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, f64 +! CHECK: %{{.*}} = llvm.atomicrmw fadd %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, vector<2xf16> ! 
CHECK: %{{.*}} = llvm.atomicrmw sub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32 ! CHECK: %{{.*}} = llvm.atomicrmw sub %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i64 @@ -436,11 +440,11 @@ end subroutine ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> -! CHECK: %{{.*}} = nvvm.mbarrier.arrive.shared %[[SHARED_PTR]] : !llvm.ptr<3> -> i64 +! CHECK: %{{.*}} = nvvm.mbarrier.arrive %[[SHARED_PTR]] : !llvm.ptr<3> -> i64 ! CHECK: %[[LLVM_PTR:.*]] = fir.convert %[[DECL_SHARED]]#0 : (!fir.ref<i64>) -> !llvm.ptr ! CHECK: %[[SHARED_PTR:.*]] = llvm.addrspacecast %[[LLVM_PTR]] : !llvm.ptr to !llvm.ptr<3> -! CHECK: nvvm.mbarrier.arrive.expect_tx %[[SHARED_PTR]], %{{.*}} : !llvm.ptr<3>, i32 +! CHECK: %{{.*}} = nvvm.inline_ptx "mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %{{.*}}, [%{{.*}}], %{{.*}};" ro(%{{.*}}, %{{.*}} : !llvm.ptr<3>, i32) -> i64 attributes(global) subroutine test_fence() @@ -490,7 +494,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_bulk_s2g ! CHECL: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(device) subroutine testAtomicCasLoop(aa, n) @@ -515,7 +519,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_barrier_try_wait() ! CHECK: scf.while -! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %{{.*}}, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %c1000000{{.*}} : !llvm.ptr, i64, i32) -> i32 +! CHECK: %{{.*}} = nvvm.inline_ptx "{\0A .reg .pred p;\0A mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}};\0A selp.b32 %{{.*}}, 1, 0, p;\0A}" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 attributes(global) subroutine test_barrier_try_wait_sleep() integer :: istat @@ -526,7 +530,7 @@ attributes(global) subroutine test_barrier_try_wait_sleep() end subroutine ! CHECK-LABEL: func.func @_QPtest_barrier_try_wait_sleep() -! CHECK: %{{.*}} = nvvm.inline_ptx ".reg .pred p; mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}}; selp.b32 %0, 1, 0, p;" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 +! CHECK: %{{.*}} = nvvm.inline_ptx "{\0A .reg .pred p;\0A mbarrier.try_wait.shared.b64 p, [%{{.*}}], %{{.*}}, %{{.*}};\0A selp.b32 %{{.*}}, 1, 0, p;\0A}" ro(%{{.*}}, %{{.*}}, %{{.*}} : !llvm.ptr, i64, i32) -> i32 attributes(global) subroutine test_tma_bulk_load_c4(a, n) integer(8), shared :: barrier1 @@ -671,7 +675,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c4 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_c8(c, n) @@ -684,7 +688,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_c8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_i4(c, n) @@ -697,7 +701,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i4 ! 
CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_i8(c, n) @@ -710,7 +714,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_i8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 @@ -724,7 +728,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r2 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_r4(c, n) @@ -737,7 +741,7 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r4 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 attributes(global) subroutine test_tma_bulk_store_r8(c, n) @@ -750,5 +754,5 @@ end subroutine ! CHECK-LABEL: func.func @_QPtest_tma_bulk_store_r8 ! CHECK: nvvm.cp.async.bulk.global.shared.cta %{{.*}}, %{{.*}}, %{{.*}} : <1>, <3> -! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group" +! CHECK: nvvm.inline_ptx "cp.async.bulk.commit_group;" ! CHECK: nvvm.cp.async.bulk.wait_group 0 diff --git a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 index 8daf20e1ae400..fec146ac70313 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate-align.f90 @@ -5,6 +5,6 @@ program main integer :: x - ! CHECK: not yet implemented: OpenMPDeclarativeAllocate + ! CHECK: not yet implemented: OmpAllocateDirective !$omp allocate(x) align(32) end diff --git a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 index e83b433d0fda0..3307eb2505b71 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declarative-allocate.f90 @@ -5,6 +5,6 @@ program main integer :: x, y - ! CHECK: not yet implemented: OpenMPDeclarativeAllocate + ! 
CHECK: not yet implemented: OmpAllocateDirective !$omp allocate(x, y) end diff --git a/flang/test/Parser/OpenMP/allocate-align-tree.f90 b/flang/test/Parser/OpenMP/allocate-align-tree.f90 index 0d247cd1ed945..d799aa10a82ff 100644 --- a/flang/test/Parser/OpenMP/allocate-align-tree.f90 +++ b/flang/test/Parser/OpenMP/allocate-align-tree.f90 @@ -16,27 +16,33 @@ program allocate_align_tree allocate(j(z), xarray(t)) end program allocate_align_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt -!CHECK-NEXT: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> -!CHECK-NEXT: | | | AttrSpec -> Allocatable -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'j' +!CHECK: DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt +!CHECK-NEXT: | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> +!CHECK-NEXT: | AttrSpec -> Allocatable +!CHECK-NEXT: | EntityDecl +!CHECK-NEXT: | | Name = 'j' +!CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'j' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '16_4' +!CHECK-NEXT: | | | LiteralConstant -> IntLiteralConstant = '16' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '32_4' +!CHECK-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '32' +!CHECK-NEXT: | | | | OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block +!CHECK-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt -!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '32_4' -!CHECK-NEXT: | | | | LiteralConstant -> IntLiteralConstant = '32' -!CHECK-NEXT: | | | OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' -!CHECK-NEXT: | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'j' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Align -> OmpAlignClause -> Scalar -> Integer -> Constant -> Expr = '16_4' -!CHECK-NEXT: | | | | | LiteralConstant -> IntLiteralConstant = '16' -!CHECK-NEXT: | | | AllocateStmt +!UNPARSE: !$OMP ALLOCATE(j) ALIGN(16_4) +!UNPARSE-NEXT: !$OMP ALLOCATE(xarray) ALIGN(32_4) ALLOCATOR(2_8) +!UNPARSE-NEXT: ALLOCATE(j(z), xarray(t)) -!UNPARSE: !$OMP ALLOCATE (j) ALIGN(16_4) 
-!UNPARSE: !$OMP ALLOCATE (xarray) ALIGN(32_4) ALLOCATOR(2_8) -!UNPARSE-NEXT: ALLOCATE(j(z), xarray(t)) diff --git a/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 b/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 index afcaf44b09f03..800e4a57d5f0e 100644 --- a/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 +++ b/flang/test/Parser/OpenMP/allocate-tree-spec-part.f90 @@ -17,33 +17,48 @@ program allocate_tree allocate (w, xarray(4), zarray(5, f)) end program allocate_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'f' -!CHECK-NEXT: | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | Designator -> DataRef -> Name = +!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'f' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block !CHECK-NEXT: | ExecutionPart -> Block !CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'f=2_4' !CHECK-NEXT: | | | Variable = 'f' !CHECK-NEXT: | | | | Designator -> DataRef -> Name = 'f' !CHECK-NEXT: | | | Expr = '2_4' !CHECK-NEXT: | | | | LiteralConstant -> IntLiteralConstant = '2' -!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpClauseList -> -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'w' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'zarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | AllocateStmt +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'w' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '3_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_const_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block +!CHECK-NEXT: | | | | 
ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | | | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | | | | | Flags = None +!CHECK-NEXT: | | | | | Block +!CHECK-NEXT: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'zarray' +!CHECK-NEXT: | | | | | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | | | | | Flags = None +!CHECK-NEXT: | | | | | | | Block +!CHECK-NEXT: | | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | | | | | OmpClauseList -> +!CHECK-NEXT: | | | | | | | | | | Flags = None +!CHECK-NEXT: | | | | | | | | | Block +!CHECK-NEXT: | | | | | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt diff --git a/flang/test/Parser/OpenMP/allocate-tree.f90 b/flang/test/Parser/OpenMP/allocate-tree.f90 index bf413d591baf2..021d8104a7e62 100644 --- a/flang/test/Parser/OpenMP/allocate-tree.f90 +++ b/flang/test/Parser/OpenMP/allocate-tree.f90 @@ -7,52 +7,54 @@ program allocate_tree use omp_lib - integer, allocatable :: w, xarray(:), zarray(:, :) - integer :: z, t + integer, allocatable :: xarray(:), zarray(:, :) + integer :: z, t, w +!$omp allocate(w) allocator(omp_const_mem_alloc) t = 2 z = 3 -!$omp allocate(w) allocator(omp_const_mem_alloc) !$omp allocate(xarray) allocator(omp_large_cap_mem_alloc) !$omp allocate(zarray) allocator(omp_default_mem_alloc) !$omp allocate - allocate(w, xarray(4), zarray(t, z)) + allocate(xarray(4), zarray(t, z)) end program allocate_tree -!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt -!CHECK-NEXT: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> -!CHECK-NEXT: | | | AttrSpec -> Allocatable -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'w' -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'xarray' -!CHECK-NEXT: | | | | ArraySpec -> DeferredShapeSpecList -> int = '1' -!CHECK-NEXT: | | | EntityDecl -!CHECK-NEXT: | | | | Name = 'zarray' -!CHECK-NEXT: | | | | ArraySpec -> DeferredShapeSpecList -> int = '2' - +!CHECK: DeclarationConstruct -> SpecificationConstruct -> OpenMPDeclarativeConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'w' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '3_8' +!CHECK-NEXT: | | | Designator -> DataRef -> Name = 
'omp_const_mem_alloc' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block -!CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate -!CHECK-NEXT: | | | Verbatim -!CHECK-NEXT: | | | OmpClauseList -> -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'w' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'xarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | OpenMPDeclarativeAllocate -!CHECK-NEXT: | | | | Verbatim -!CHECK-NEXT: | | | | OmpObjectList -> OmpObject -> Designator -> DataRef -> Name = 'zarray' -!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = -!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = -!CHECK-NEXT: | | | AllocateStmt +!CHECK: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | OmpBeginDirective +!CHECK-NEXT: | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'xarray' +!CHECK-NEXT: | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '2_8' +!CHECK-NEXT: | | | Designator -> DataRef -> Name = 'omp_large_cap_mem_alloc' +!CHECK-NEXT: | | Flags = None +!CHECK-NEXT: | Block +!CHECK-NEXT: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | OmpBeginDirective +!CHECK-NEXT: | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | OmpArgumentList -> OmpArgument -> OmpLocator -> OmpObject -> Designator -> DataRef -> Name = 'zarray' +!CHECK-NEXT: | | | | OmpClauseList -> OmpClause -> Allocator -> Scalar -> Integer -> Expr = '1_8' +!CHECK-NEXT: | | | | | Designator -> DataRef -> Name = 'omp_default_mem_alloc' +!CHECK-NEXT: | | | | Flags = None +!CHECK-NEXT: | | | Block +!CHECK-NEXT: | | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OmpAllocateDirective +!CHECK-NEXT: | | | | | OmpBeginDirective +!CHECK-NEXT: | | | | | | OmpDirectiveName -> llvm::omp::Directive = allocate +!CHECK-NEXT: | | | | | | OmpClauseList -> +!CHECK-NEXT: | | | | | | Flags = None +!CHECK-NEXT: | | | | | Block +!CHECK-NEXT: | | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AllocateStmt -!UNPARSE: !$OMP ALLOCATE (w) ALLOCATOR(3_8) -!UNPARSE-NEXT: !$OMP ALLOCATE (xarray) ALLOCATOR(2_8) -!UNPARSE-NEXT: !$OMP ALLOCATE (zarray) ALLOCATOR(1_8) +!UNPARSE: !$OMP ALLOCATE(w) ALLOCATOR(3_8) +!UNPARSE-NEXT: t=2_4 +!UNPARSE-NEXT: z=3_4 +!UNPARSE-NEXT: !$OMP ALLOCATE(xarray) ALLOCATOR(2_8) +!UNPARSE-NEXT: !$OMP ALLOCATE(zarray) ALLOCATOR(1_8) !UNPARSE-NEXT: !$OMP ALLOCATE -!UNPARSE-NEXT: ALLOCATE(w, xarray(4_4), zarray(t,z)) +!UNPARSE-NEXT: ALLOCATE(xarray(4_4), zarray(t,z)) diff --git a/flang/test/Parser/OpenMP/allocate-unparse.f90 b/flang/test/Parser/OpenMP/allocate-unparse.f90 index 63cc35cd55082..b61a97150cad2 100644 --- a/flang/test/Parser/OpenMP/allocate-unparse.f90 +++ b/flang/test/Parser/OpenMP/allocate-unparse.f90 @@ 
-34,19 +34,19 @@ program allocate_unparse end program allocate_unparse !CHECK:!$OMP ALLOCATE{{[ ]*$}} -!CHECK:!$OMP ALLOCATE (x,y) -!CHECK:!$OMP ALLOCATE (x,y) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (a,b) +!CHECK:!$OMP ALLOCATE(x, y) +!CHECK:!$OMP ALLOCATE(x, y) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(a, b) !CHECK:ALLOCATE(darray(a,b)) !CHECK:!$OMP ALLOCATE ALLOCATOR(omp_default_mem_alloc) !CHECK:ALLOCATE(darray(a,b)) -!CHECK:!$OMP ALLOCATE (a,b) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(a, b) ALLOCATOR(omp_default_mem_alloc) !CHECK:ALLOCATE(darray(a,b)) -!CHECK:!$OMP ALLOCATE (t) ALLOCATOR(omp_const_mem_alloc) -!CHECK:!$OMP ALLOCATE (z) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (m) ALLOCATOR(omp_default_mem_alloc) -!CHECK:!$OMP ALLOCATE (n) -!CHECK:!$OMP ALLOCATE (j) ALIGN(16) +!CHECK:!$OMP ALLOCATE(t) ALLOCATOR(omp_const_mem_alloc) +!CHECK:!$OMP ALLOCATE(z) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(m) ALLOCATOR(omp_default_mem_alloc) +!CHECK:!$OMP ALLOCATE(n) +!CHECK:!$OMP ALLOCATE(j) ALIGN(16) !CHECK:ALLOCATE(darray(z,t)) !CHECK:!$OMP ALLOCATE{{[ ]*$}} !CHECK:ALLOCATE(darray(a,b)) diff --git a/flang/test/Parser/OpenMP/nested-directive.f90 b/flang/test/Parser/OpenMP/nested-directive.f90 new file mode 100644 index 0000000000000..2a10bbe666bb8 --- /dev/null +++ b/flang/test/Parser/OpenMP/nested-directive.f90 @@ -0,0 +1,7 @@ +! RUN: %flang_fc1 -fdebug-unparse -fopenmp %s 2>&1 | FileCheck %s --match-full-lines + +subroutine func + implicit none +! CHECK: !$OMP NOTHING + !$omp nothing !$omp Cannot nest directives inside directives; must be interpreted as a comment +end subroutine func diff --git a/flang/test/Semantics/OpenMP/allocate-align01.f90 b/flang/test/Semantics/OpenMP/allocate-align01.f90 index 88bcd6d2f1008..4a1e60cf73fff 100644 --- a/flang/test/Semantics/OpenMP/allocate-align01.f90 +++ b/flang/test/Semantics/OpenMP/allocate-align01.f90 @@ -11,9 +11,9 @@ program allocate_align_tree integer :: z, t, xx t = 2 z = 3 + !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !ERROR: Must be a constant value !$omp allocate(j) align(xx) - !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !ERROR: The alignment should be positive !$omp allocate(xarray) align(-32) allocator(omp_large_cap_mem_alloc) allocate(j(z), xarray(t)) diff --git a/flang/test/Semantics/OpenMP/allocate-directive.f90 b/flang/test/Semantics/OpenMP/allocate-directive.f90 index 18a14b825f00d..e34125b392bda 100644 --- a/flang/test/Semantics/OpenMP/allocate-directive.f90 +++ b/flang/test/Semantics/OpenMP/allocate-directive.f90 @@ -11,7 +11,7 @@ integer, allocatable :: a, b, m, n, t, z !$omp allocate(x, y) !$omp allocate(x, y) allocator(omp_default_mem_alloc) - + continue !$omp allocate(a, b) allocate ( a, b ) diff --git a/flang/test/Semantics/OpenMP/allocate01.f90 b/flang/test/Semantics/OpenMP/allocate01.f90 index 229fd4d6c3f95..5fe4efdd106d9 100644 --- a/flang/test/Semantics/OpenMP/allocate01.f90 +++ b/flang/test/Semantics/OpenMP/allocate01.f90 @@ -17,7 +17,7 @@ subroutine sema() !ERROR: A list item on a declarative ALLOCATE must be declared in the same scope in which the directive appears !$omp allocate(y) - print *, a + print *, a !WARNING: The executable form of the OpenMP ALLOCATE directive has been deprecated, please use ALLOCATORS instead [-Wopen-mp-usage] !$omp allocate(x) 
allocator(omp_default_mem_alloc) diff --git a/flang/test/Semantics/OpenMP/allocate02.f90 b/flang/test/Semantics/OpenMP/allocate02.f90 index 8f0579e810bb9..a1e684796edb2 100644 --- a/flang/test/Semantics/OpenMP/allocate02.f90 +++ b/flang/test/Semantics/OpenMP/allocate02.f90 @@ -16,6 +16,7 @@ subroutine allocate() !ERROR: At most one ALLOCATOR clause can appear on the ALLOCATE directive !$omp allocate(x, y) allocator(omp_default_mem_alloc) allocator(omp_default_mem_alloc) + continue !$omp allocate(darray) allocator(omp_default_mem_alloc) allocate ( darray(a, b) ) diff --git a/flang/test/Semantics/OpenMP/allocate03.f90 b/flang/test/Semantics/OpenMP/allocate03.f90 index e35115f3897cc..3609f38eb6ee7 100644 --- a/flang/test/Semantics/OpenMP/allocate03.f90 +++ b/flang/test/Semantics/OpenMP/allocate03.f90 @@ -17,6 +17,7 @@ subroutine allocate() !ERROR: A variable that is part of another variable (as an array or structure element) cannot appear on the ALLOCATE directive !$omp allocate(my_var%array) + continue !ERROR: A variable that is part of another variable (as an array or structure element) cannot appear on the ALLOCATE directive !$omp allocate(darray, my_var%array) allocator(omp_default_mem_alloc) diff --git a/flang/test/Semantics/OpenMP/allocate06.f90 b/flang/test/Semantics/OpenMP/allocate06.f90 index 9b57322bbadc6..272094aaaeec2 100644 --- a/flang/test/Semantics/OpenMP/allocate06.f90 +++ b/flang/test/Semantics/OpenMP/allocate06.f90 @@ -13,7 +13,7 @@ subroutine allocate() !ERROR: A list item in a declarative ALLOCATE cannot have the ALLOCATABLE or POINTER attribute !$omp allocate(darray) allocator(omp_default_mem_alloc) - + continue !$omp allocate(darray) allocator(omp_default_mem_alloc) allocate(darray(a, b)) diff --git a/flang/test/Semantics/OpenMP/allocate10.f90 b/flang/test/Semantics/OpenMP/allocate10.f90 index a9db7330296ba..0a9e85b8ae2fe 100644 --- a/flang/test/Semantics/OpenMP/allocate10.f90 +++ b/flang/test/Semantics/OpenMP/allocate10.f90 @@ -4,8 +4,8 @@ subroutine f00 integer, allocatable :: x, y continue - !ERROR: If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items !$omp allocate + !ERROR: If multiple directives are present in an executable ALLOCATE directive, at most one of them may specify no list items !$omp allocate allocate(x, y) end diff --git a/flang/test/Semantics/OpenMP/allocate12.f90 b/flang/test/Semantics/OpenMP/allocate12.f90 new file mode 100644 index 0000000000000..2b3b510fbf40c --- /dev/null +++ b/flang/test/Semantics/OpenMP/allocate12.f90 @@ -0,0 +1,16 @@ +!RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags -fopenmp-version=51 + +subroutine f00 + integer, allocatable :: x + continue + !ERROR: An executable ALLOCATE directive must be associated with an ALLOCATE statement + !$omp allocate(x) +end + +subroutine f01 + integer, allocatable :: x + continue + !$omp allocate(x) + !ERROR: The statement associated with executable ALLOCATE directive must be an ALLOCATE statement + continue +end diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index ae555a256ba66..4e6b4195a9c5e 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -47,8 +47,6 @@ set(LIBC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) set(LIBC_ENABLE_USE_BY_CLANG OFF CACHE BOOL "Whether or not to place libc in a build directory findable by a just built clang") -set(LIBC_KERNEL_HEADERS "/usr/include" CACHE STRING "Path to Linux kernel headers") - # Defining a global namespace to enclose all libc functions. 
set(default_namespace "__llvm_libc") if(LLVM_VERSION_MAJOR) @@ -146,6 +144,11 @@ option(LLVM_LIBC_ALL_HEADERS "Outputs all functions in header files, regardless option(LIBC_CONFIG_PATH "The path to user provided folder that configures the build for the target system." OFF) +if(LIBC_TARGET_OS_IS_LINUX) + set(kernel_headers "/usr/include") +endif() +set(LIBC_KERNEL_HEADERS "${kernel_headers}" CACHE STRING "Path to Linux kernel headers") + set(LIBC_ENABLE_UNITTESTS ON) set(LIBC_ENABLE_HERMETIC_TESTS ${LLVM_LIBC_FULL_BUILD}) diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 714120a79e39a..e0dd15b803253 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -325,8 +325,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.dup2 libc.src.unistd.dup3 libc.src.unistd.execve - # Disabled while SYS_faccessat2 is unavailable on the buildbot. - # libc.src.unistd.faccessat + libc.src.unistd.faccessat libc.src.unistd.fchdir libc.src.unistd.fpathconf libc.src.unistd.fsync diff --git a/libc/config/linux/aarch64/exclude.txt b/libc/config/linux/aarch64/exclude.txt new file mode 100644 index 0000000000000..f2f553f78933c --- /dev/null +++ b/libc/config/linux/aarch64/exclude.txt @@ -0,0 +1,8 @@ +include(CheckSymbolExists) +check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2) +if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") + list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS + libc.src.unistd.faccessat + ) +endif() diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index f6bbb346d10e5..0d031d8844f13 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -329,6 +329,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.dup2 libc.src.unistd.dup3 libc.src.unistd.execve + libc.src.unistd.faccessat libc.src.unistd.fchdir libc.src.unistd.fpathconf libc.src.unistd.fsync diff --git a/libc/config/linux/riscv/exclude.txt b/libc/config/linux/riscv/exclude.txt new file mode 100644 index 0000000000000..f2f553f78933c --- /dev/null +++ b/libc/config/linux/riscv/exclude.txt @@ -0,0 +1,8 @@ +include(CheckSymbolExists) +check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2) +if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") + list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS + libc.src.unistd.faccessat + ) +endif() diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 7a8d74a4e5da9..a44e2041e57f2 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -326,6 +326,7 @@ set(TARGET_LIBC_ENTRYPOINTS # unistd.h entrypoints libc.src.unistd.access libc.src.unistd.chdir + libc.src.unistd.chown libc.src.unistd.close libc.src.unistd.dup libc.src.unistd.dup2 @@ -344,6 +345,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.getppid libc.src.unistd.getsid libc.src.unistd.gettid + libc.src.unistd.getgid libc.src.unistd.getuid libc.src.unistd.isatty libc.src.unistd.link diff --git a/libc/config/linux/x86_64/exclude.txt b/libc/config/linux/x86_64/exclude.txt index a0686310d21ac..31b60a9c3497c 100644 --- a/libc/config/linux/x86_64/exclude.txt +++ b/libc/config/linux/x86_64/exclude.txt @@ -23,6 +23,7 @@ endif() include(CheckSymbolExists) 
check_symbol_exists(SYS_faccessat2 "sys/syscall.h" HAVE_SYS_FACCESSAT2) if(NOT HAVE_SYS_FACCESSAT2) + message(VERBOSE "unistd.faccessat excluded from build, faccessat2 syscall is not available on the system") list(APPEND TARGET_LLVMLIBC_REMOVED_ENTRYPOINTS libc.src.unistd.faccessat ) diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index 225843924c243..433c47b174766 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -479,3 +479,11 @@ add_proxy_header_library( libc.include.llvm-libc-types.struct_rlimit libc.include.sys_resource ) + +add_proxy_header_library( + gid_t + HDRS + gid_t.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-types.gid_t +) diff --git a/libc/hdr/types/gid_t.h b/libc/hdr/types/gid_t.h new file mode 100644 index 0000000000000..bc274aaa9a8a8 --- /dev/null +++ b/libc/hdr/types/gid_t.h @@ -0,0 +1,22 @@ +//===-- Proxy for gid_t ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_TYPES_GID_T_H +#define LLVM_LIBC_HDR_TYPES_GID_T_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/gid_t.h" + +#else // Overlay mode + +#include <sys/types.h> + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_TYPES_GID_T_H diff --git a/libc/include/unistd.yaml b/libc/include/unistd.yaml index 2ff86eafaf550..0e5b22e627b67 100644 --- a/libc/include/unistd.yaml +++ b/libc/include/unistd.yaml @@ -3,6 +3,7 @@ header_template: unistd.h.def macros: [] types: - type_name: uid_t + - type_name: gid_t - type_name: ssize_t - type_name: size_t - type_name: pid_t @@ -54,6 +55,14 @@ functions: return_type: int arguments: - type: const char * + - name: chown + standards: + - POSIX + return_type: int + arguments: + - type: const char * + - type: uid_t + - type: gid_t - name: close standards: - POSIX @@ -195,6 +204,12 @@ functions: return_type: uid_t arguments: - type: void + - name: getgid + standards: + - POSIX + return_type: gid_t + arguments: + - type: void - name: isatty standards: - POSIX diff --git a/libc/src/__support/float_to_string.h b/libc/src/__support/float_to_string.h index cab146a5b8698..9115ed2856881 100644 --- a/libc/src/__support/float_to_string.h +++ b/libc/src/__support/float_to_string.h @@ -700,7 +700,11 @@ template <> class FloatToString<long double> { const int SHIFT_AMOUNT = FLOAT_AS_INT_WIDTH + exponent; static_assert(EXTRA_INT_WIDTH >= sizeof(long double) * 8); - float_as_fixed <<= SHIFT_AMOUNT; + if (SHIFT_AMOUNT > 0) { + float_as_fixed <<= SHIFT_AMOUNT; + } else { + float_as_fixed >>= -SHIFT_AMOUNT; + } // If there are still digits above the decimal point, handle those. if (cpp::countl_zero(float_as_fixed) < @@ -769,7 +773,7 @@ template <> class FloatToString<long double> { // The decimal representation of 2**(-i) will have exactly i digits after // the decimal point. 
const int num_requested_digits = - static_cast<int>((negative_block_index + 1) * BLOCK_SIZE); + static_cast<int>(negative_block_index * BLOCK_SIZE); return num_requested_digits > -exponent; } diff --git a/libc/src/unistd/CMakeLists.txt b/libc/src/unistd/CMakeLists.txt index 78c3bf8442fab..337480cbbf928 100644 --- a/libc/src/unistd/CMakeLists.txt +++ b/libc/src/unistd/CMakeLists.txt @@ -27,6 +27,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.chdir ) +add_entrypoint_object( + chown + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.chown +) + add_entrypoint_object( close ALIAS @@ -160,6 +167,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.getuid ) +add_entrypoint_object( + getgid + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.getgid +) + add_entrypoint_object( isatty ALIAS diff --git a/libc/src/unistd/chown.h b/libc/src/unistd/chown.h new file mode 100644 index 0000000000000..84a8eba2cb2e6 --- /dev/null +++ b/libc/src/unistd/chown.h @@ -0,0 +1,22 @@ +//===-- Implementation header for chown -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_UNISTD_CHOWN_H +#define LLVM_LIBC_SRC_UNISTD_CHOWN_H + +#include "hdr/types/gid_t.h" +#include "hdr/types/uid_t.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int chown(const char *path, uid_t owner, gid_t group); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_UNISTD_CHOWN_H diff --git a/libc/src/unistd/getgid.h b/libc/src/unistd/getgid.h new file mode 100644 index 0000000000000..eed0b20d688b1 --- /dev/null +++ b/libc/src/unistd/getgid.h @@ -0,0 +1,22 @@ +//===-- Implementation header for getgid ------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_UNISTD_GETGID_H +#define LLVM_LIBC_SRC_UNISTD_GETGID_H + +#include "hdr/types/gid_t.h" +#include "hdr/unistd_macros.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +gid_t getgid(); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_UNISTD_GETGID_H diff --git a/libc/src/unistd/linux/CMakeLists.txt b/libc/src/unistd/linux/CMakeLists.txt index 4eb3c7d3d7fae..c2dacc6456e27 100644 --- a/libc/src/unistd/linux/CMakeLists.txt +++ b/libc/src/unistd/linux/CMakeLists.txt @@ -25,6 +25,20 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + chown + SRCS + chown.cpp + HDRS + ../chown.h + DEPENDS + libc.hdr.types.uid_t + libc.hdr.types.gid_t + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil + libc.src.errno.errno +) + add_entrypoint_object( close SRCS @@ -276,6 +290,20 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + getgid + SRCS + getgid.cpp + HDRS + ../getgid.h + DEPENDS + libc.hdr.types.gid_t + libc.hdr.fcntl_macros + libc.include.unistd + libc.include.sys_syscall + libc.src.__support.OSUtil.osutil +) + add_entrypoint_object( getuid SRCS diff --git a/libc/src/unistd/linux/chown.cpp b/libc/src/unistd/linux/chown.cpp new file mode 100644 index 0000000000000..c7bf1703ffe57 --- /dev/null +++ b/libc/src/unistd/linux/chown.cpp @@ -0,0 +1,29 @@ +//===-- Linux implementation of chown -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/chown.h" + +#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/common.h" + +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" +#include <sys/syscall.h> // For syscall numbers. + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, chown, (const char *path, uid_t owner, gid_t group)) { + int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_chown, path, owner, group); + if (ret < 0) { + libc_errno = -ret; + return -1; + } + return 0; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/unistd/linux/getgid.cpp b/libc/src/unistd/linux/getgid.cpp new file mode 100644 index 0000000000000..1656fd601d843 --- /dev/null +++ b/libc/src/unistd/linux/getgid.cpp @@ -0,0 +1,23 @@ +//===-- Linux implementation of getgid ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/unistd/getgid.h" + +#include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +#include <sys/syscall.h> // For syscall numbers. 
+ +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(gid_t, getgid, ()) { + return LIBC_NAMESPACE::syscall_impl<gid_t>(SYS_getgid); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/UnitTest/FEnvSafeTest.cpp b/libc/test/UnitTest/FEnvSafeTest.cpp index 4393f9d5e5c3b..64f50d7be7fe3 100644 --- a/libc/test/UnitTest/FEnvSafeTest.cpp +++ b/libc/test/UnitTest/FEnvSafeTest.cpp @@ -43,7 +43,8 @@ void FEnvSafeTest::set_fenv(const fenv_t &fenv) { void FEnvSafeTest::expect_fenv_eq(const fenv_t &before_fenv, const fenv_t &after_fenv) { -#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && !defined(LIBC_COMPILER_IS_MSVC) +#if defined(LIBC_TARGET_ARCH_IS_AARCH64) && !defined(LIBC_COMPILER_IS_MSVC) && \ + defined(__ARM_FP) using FPState = LIBC_NAMESPACE::fputil::FEnv::FPState; const FPState &before_state = reinterpret_cast<const FPState &>(before_fenv); const FPState &after_state = reinterpret_cast<const FPState &>(after_fenv); diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp index f1b545ba546f9..42fdd59cf4d9c 100644 --- a/libc/test/src/stdio/sprintf_test.cpp +++ b/libc/test/src/stdio/sprintf_test.cpp @@ -1537,6 +1537,14 @@ TEST(LlvmLibcSPrintfTest, FloatDecimalLongDoubleConv) { #if defined(LIBC_TYPES_LONG_DOUBLE_IS_X86_FLOAT80) #ifndef LIBC_COPT_FLOAT_TO_STR_REDUCED_PRECISION + written = LIBC_NAMESPACE::sprintf( + buff, "%.75Lf", + 0.0833333333333333333355920878593448009041821933351457118988037109375L); + ASSERT_STREQ_LEN(written, buff, + "0." + "08333333333333333333559208785934480090418219333514571189880" + "3710937500000000"); + written = LIBC_NAMESPACE::sprintf(buff, "%Lf", 1e100L); ASSERT_STREQ_LEN(written, buff, "99999999999999999996693535322073426194986990198284960792713" @@ -2976,6 +2984,10 @@ TEST(LlvmLibcSPrintfTest, FloatAutoLongDoubleConv) { written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 0xf.fffffffffffffffp+16380L); ASSERT_STREQ_LEN(written, buff, "1.18973e+4932"); + // Minimum normal + written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 3.36210314311209350626E-4932L); + ASSERT_STREQ_LEN(written, buff, "3.3621e-4932"); + written = LIBC_NAMESPACE::sprintf(buff, "%Lg", 0xa.aaaaaaaaaaaaaabp-7L); ASSERT_STREQ_LEN(written, buff, "0.0833333"); diff --git a/libc/test/src/unistd/CMakeLists.txt b/libc/test/src/unistd/CMakeLists.txt index 44f28fff9ad39..07070535459ec 100644 --- a/libc/test/src/unistd/CMakeLists.txt +++ b/libc/test/src/unistd/CMakeLists.txt @@ -36,6 +36,26 @@ add_libc_unittest( libc.test.UnitTest.ErrnoSetterMatcher ) +add_libc_unittest( + chown_test + SUITE + libc_unistd_unittests + SRCS + chown_test.cpp + DEPENDS + libc.hdr.fcntl_macros + libc.include.unistd + libc.src.errno.errno + libc.src.unistd.chown + libc.src.unistd.close + libc.src.unistd.unlink + libc.src.fcntl.open + libc.src.unistd.getuid + libc.src.unistd.getgid + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher +) + add_libc_unittest( dup_test SUITE @@ -437,6 +457,16 @@ add_libc_unittest( libc.test.UnitTest.ErrnoCheckingTest ) +add_libc_unittest( + getgid_test + SUITE + libc_unistd_unittests + SRCS + getgid_test.cpp + DEPENDS + libc.src.unistd.getgid +) + add_libc_unittest( getpid_test SUITE diff --git a/libc/test/src/unistd/chown_test.cpp b/libc/test/src/unistd/chown_test.cpp new file mode 100644 index 0000000000000..8b1f783273624 --- /dev/null +++ b/libc/test/src/unistd/chown_test.cpp @@ -0,0 +1,51 @@ +//===-- Unittests for chown -----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with 
LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/fcntl/open.h" +#include "src/unistd/chown.h" +#include "src/unistd/close.h" +#include "src/unistd/getgid.h" +#include "src/unistd/getuid.h" +#include "src/unistd/unlink.h" + +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/Test.h" + +#include "hdr/fcntl_macros.h" +#include <sys/stat.h> + +using LlvmLibcChownTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcChownTest, ChownSuccess) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + uid_t my_uid = LIBC_NAMESPACE::getuid(); + gid_t my_gid = LIBC_NAMESPACE::getgid(); + constexpr const char *FILENAME = "chown.test"; + auto TEST_FILE = libc_make_test_file_path(FILENAME); + + // Create a test file. + int write_fd = LIBC_NAMESPACE::open(TEST_FILE, O_WRONLY | O_CREAT, S_IRWXU); + ASSERT_ERRNO_SUCCESS(); + ASSERT_GT(write_fd, 0); + + // Change the ownership of the file. + ASSERT_THAT(LIBC_NAMESPACE::chown(TEST_FILE, my_uid, my_gid), Succeeds(0)); + + // Close the file descriptor. + ASSERT_THAT(LIBC_NAMESPACE::close(write_fd), Succeeds(0)); + + // Clean up the test file. + ASSERT_THAT(LIBC_NAMESPACE::unlink(TEST_FILE), Succeeds(0)); +} + +TEST_F(LlvmLibcChownTest, ChownNonExistentFile) { + using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; + ASSERT_THAT(LIBC_NAMESPACE::chown("non-existent-file", 1000, 1000), + Fails(ENOENT)); +} diff --git a/llvm/lib/BinaryFormat/Minidump.cpp b/libc/test/src/unistd/getgid_test.cpp similarity index 51% rename from llvm/lib/BinaryFormat/Minidump.cpp rename to libc/test/src/unistd/getgid_test.cpp index b618fb1570126..77dbad2f18e00 100644 --- a/llvm/lib/BinaryFormat/Minidump.cpp +++ b/libc/test/src/unistd/getgid_test.cpp @@ -1,4 +1,4 @@ -//===-- Minidump.cpp - Minidump constants and structures ---------*- C++-*-===// +//===-- Unittests for getgid ----------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,9 +6,10 @@ // //===----------------------------------------------------------------------===// -#include "llvm/BinaryFormat/Minidump.h" +#include "src/unistd/getgid.h" +#include "test/UnitTest/Test.h" -using namespace llvm::minidump; - -constexpr uint32_t Header::MagicSignature; -constexpr uint16_t Header::MagicVersion; +TEST(LlvmLibcGetGidTest, SmokeTest) { + // getgid always succeeds. So, we just call it as a smoke test. 
+ LIBC_NAMESPACE::getgid(); +} diff --git a/libclc/clc/include/clc/math/clc_cbrt.inc b/libclc/clc/include/clc/math/clc_cbrt.h similarity index 100% rename from libclc/clc/include/clc/math/clc_cbrt.inc rename to libclc/clc/include/clc/math/clc_cbrt.h diff --git a/libclc/clc/lib/generic/math/clc_cbrt.cl b/libclc/clc/lib/generic/math/clc_cbrt.cl index 105f6329d5bad..935b7b7eae78c 100644 --- a/libclc/clc/lib/generic/math/clc_cbrt.cl +++ b/libclc/clc/lib/generic/math/clc_cbrt.cl @@ -8,6 +8,7 @@ #include <clc/clc_convert.h> #include <clc/internal/clc.h> +#include <clc/math/clc_cbrt.h> #include <clc/math/clc_copysign.h> #include <clc/math/clc_fabs.h> #include <clc/math/clc_fma.h> diff --git a/libclc/opencl/lib/generic/math/cbrt.cl b/libclc/opencl/lib/generic/math/cbrt.cl index 0d670150ed4c9..7de61436522b3 100644 --- a/libclc/opencl/lib/generic/math/cbrt.cl +++ b/libclc/opencl/lib/generic/math/cbrt.cl @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include <clc/math/clc_cbrt.inc> +#include <clc/math/clc_cbrt.h> #include <clc/opencl/math/cbrt.h> #define __CLC_FUNCTION cbrt diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index dbe69484abedf..e15c5b1a5d32f 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -451,7 +451,7 @@ Instead use: .. code-block:: cpp - // UNSUPPORTED: std-at-least-c++26 + // REQUIRES: std-at-least-c++26 There is no corresponding ``std-at-most-c++23``. This could be useful when tests are only valid for a small set of standard versions. For example, a diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h index 796fa924be121..aef036a2c9586 100644 --- a/libcxx/include/__exception/exception_ptr.h +++ b/libcxx/include/__exception/exception_ptr.h @@ -11,18 +11,24 @@ #include <__config> #include <__cstddef/nullptr_t.h> +#include <__cstddef/size_t.h> #include <__exception/operations.h> #include <__memory/addressof.h> #include <__memory/construct_at.h> #include <__type_traits/decay.h> #include <__type_traits/is_pointer.h> -#include <cstdlib> +#include <__utility/move.h> +#include <__utility/swap.h> +#include <__verbose_abort> #include <typeinfo> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header #endif +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + #ifndef _LIBCPP_ABI_MICROSOFT # if _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION @@ -30,7 +36,7 @@ namespace __cxxabiv1 { extern "C" { -_LIBCPP_OVERRIDABLE_FUNC_VIS void* __cxa_allocate_exception(size_t) throw(); +_LIBCPP_OVERRIDABLE_FUNC_VIS void* __cxa_allocate_exception(std::size_t) throw(); _LIBCPP_OVERRIDABLE_FUNC_VIS void __cxa_free_exception(void*) throw(); struct __cxa_exception; @@ -57,6 +63,8 @@ _LIBCPP_BEGIN_UNVERSIONED_NAMESPACE_STD #ifndef _LIBCPP_ABI_MICROSOFT +inline _LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT; + class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { void* __ptr_; @@ -75,7 +83,15 @@ class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { _LIBCPP_HIDE_FROM_ABI exception_ptr(nullptr_t) _NOEXCEPT : __ptr_() {} exception_ptr(const exception_ptr&) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI exception_ptr(exception_ptr&& __other) _NOEXCEPT : __ptr_(__other.__ptr_) { + __other.__ptr_ = nullptr; + } exception_ptr& operator=(const exception_ptr&) _NOEXCEPT; + _LIBCPP_HIDE_FROM_ABI exception_ptr& operator=(exception_ptr&& __other) _NOEXCEPT { + exception_ptr __tmp(std::move(__other)); + std::swap(__tmp, *this); + return 
*this; + } ~exception_ptr() _NOEXCEPT; _LIBCPP_HIDE_FROM_ABI explicit operator bool() const _NOEXCEPT { return __ptr_ != nullptr; } @@ -88,10 +104,16 @@ class _LIBCPP_EXPORTED_FROM_ABI exception_ptr { return !(__x == __y); } + friend _LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT; + friend _LIBCPP_EXPORTED_FROM_ABI exception_ptr current_exception() _NOEXCEPT; friend _LIBCPP_EXPORTED_FROM_ABI void rethrow_exception(exception_ptr); }; +inline _LIBCPP_HIDE_FROM_ABI void swap(exception_ptr& __x, exception_ptr& __y) _NOEXCEPT { + std::swap(__x.__ptr_, __y.__ptr_); +} + # if _LIBCPP_HAS_EXCEPTIONS # if _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION template <class _Ep> @@ -153,7 +175,7 @@ _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep __e) _NOEXCEPT { # else // !_LIBCPP_HAS_EXCEPTIONS template <class _Ep> _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep) _NOEXCEPT { - std::abort(); + _LIBCPP_VERBOSE_ABORT("make_exception_ptr was called in -fno-exceptions mode"); } # endif // _LIBCPP_HAS_EXCEPTIONS @@ -201,4 +223,6 @@ _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep __e) _NOEXCEPT { #endif // _LIBCPP_ABI_MICROSOFT _LIBCPP_END_UNVERSIONED_NAMESPACE_STD +_LIBCPP_POP_MACROS + #endif // _LIBCPP___EXCEPTION_EXCEPTION_PTR_H diff --git a/libcxx/include/__numeric/saturation_arithmetic.h b/libcxx/include/__numeric/saturation_arithmetic.h index 9bd3af12c9572..7a7410b5dea08 100644 --- a/libcxx/include/__numeric/saturation_arithmetic.h +++ b/libcxx/include/__numeric/saturation_arithmetic.h @@ -30,6 +30,9 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp __add_sat(_Tp __x, _Tp __y) noexcept { +# if defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 2101 + return __builtin_elementwise_add_sat(__x, __y); +# else if (_Tp __sum; !__builtin_add_overflow(__x, __y, std::addressof(__sum))) return __sum; // Handle overflow @@ -44,10 +47,14 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __add_sat(_Tp __x, _Tp __y) noexcept { // Overflows if (x < 0 && y < 0) return std::numeric_limits<_Tp>::min(); } +# endif } template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp __sub_sat(_Tp __x, _Tp __y) noexcept { +# if defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 2101 + return __builtin_elementwise_sub_sat(__x, __y); +# else if (_Tp __sub; !__builtin_sub_overflow(__x, __y, std::addressof(__sub))) return __sub; // Handle overflow @@ -63,6 +70,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __sub_sat(_Tp __x, _Tp __y) noexcept { // Overflows if (x < 0 && y > 0) return std::numeric_limits<_Tp>::min(); } +# endif } template <__signed_or_unsigned_integer _Tp> diff --git a/libcxx/include/__type_traits/reference_constructs_from_temporary.h b/libcxx/include/__type_traits/reference_constructs_from_temporary.h index 3d097ce90cb09..a8325620414ea 100644 --- a/libcxx/include/__type_traits/reference_constructs_from_temporary.h +++ b/libcxx/include/__type_traits/reference_constructs_from_temporary.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_constructs_from_temporary) +#if _LIBCPP_STD_VER >= 23 template <class _Tp, class _Up> struct _LIBCPP_NO_SPECIALIZATIONS reference_constructs_from_temporary diff --git a/libcxx/include/__type_traits/reference_converts_from_temporary.h b/libcxx/include/__type_traits/reference_converts_from_temporary.h index c68f1765af9d5..9c51225e53b8e 100644 --- 
a/libcxx/include/__type_traits/reference_converts_from_temporary.h +++ b/libcxx/include/__type_traits/reference_converts_from_temporary.h @@ -18,7 +18,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_converts_from_temporary) +#if _LIBCPP_STD_VER >= 23 template <class _Tp, class _Up> struct _LIBCPP_NO_SPECIALIZATIONS reference_converts_from_temporary diff --git a/libcxx/include/exception b/libcxx/include/exception index 74229cd16c006..0b2372e571e99 100644 --- a/libcxx/include/exception +++ b/libcxx/include/exception @@ -93,10 +93,13 @@ template <class E> void rethrow_if_nested(const E& e); # if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include <cstddef> -# include <cstdlib> # include <new> # include <type_traits> # endif + +# if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 23 +# include <cstdlib> +# endif #endif // __cplusplus < 201103L && defined(_LIBCPP_USE_FROZEN_CXX03_HEADERS) #endif // _LIBCPP_EXCEPTION diff --git a/libcxx/include/string b/libcxx/include/string index 8f80afbc2fd37..ede42467b99fe 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -644,6 +644,7 @@ basic_string<char32_t> operator""s( const char32_t *str, size_t len ); # include <__utility/forward.h> # include <__utility/is_pointer_in_range.h> # include <__utility/move.h> +# include <__utility/no_destroy.h> # include <__utility/scope_guard.h> # include <__utility/swap.h> # include <climits> @@ -914,6 +915,11 @@ private: union __rep { __short __s; __long __l; + + __rep() = default; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__short __r) : __s(__r) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__long __r) : __l(__r) {} + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 __rep(__uninitialized_tag) {} }; _LIBCPP_COMPRESSED_PAIR(__rep, __rep_, allocator_type, __alloc_); @@ -1206,7 +1212,10 @@ public: } # endif // _LIBCPP_CXX03_LANG - inline _LIBCPP_CONSTEXPR_SINCE_CXX20 ~basic_string() { __reset_internal_buffer(); } + // TODO(boomanaiden154): Once we mark this in destructors as dead on return, + // we can use a normal call to __reset_internal_buffer and remove the extra + // __rep constructor. + inline _LIBCPP_CONSTEXPR_SINCE_CXX20 ~basic_string() { __reset_internal_buffer(__rep(__uninitialized_tag())); } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 operator __self_view() const _NOEXCEPT { return __self_view(typename __self_view::__assume_valid(), data(), size()); @@ -2259,18 +2268,12 @@ private: return __long(__buffer, __capacity); } - // Deallocate the long buffer if it exists and clear the short buffer so we are an empty string - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __reset_internal_buffer() { + // Replace the current buffer with __new_rep. Deallocate the old long buffer if it exists. 
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __reset_internal_buffer(__rep __new_rep = __short()) { __annotate_delete(); if (__is_long()) __alloc_traits::deallocate(__alloc_, __get_long_pointer(), __get_long_cap()); - __rep_.__s = __short(); - } - - // Replace the current buffer with __alloc; the first __size elements constitute a string - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __replace_internal_buffer(__long __alloc) { - __reset_internal_buffer(); - __rep_.__l = __alloc; + __rep_ = __new_rep; } // Initialize the internal buffer to hold __size elements @@ -2444,7 +2447,7 @@ private: __annotate_delete(); auto __guard = std::__make_scope_guard(__annotate_new_size(*this)); auto __alloc = __str.__alloc_; - __replace_internal_buffer(__allocate_long_buffer(__alloc, __str.size())); + __reset_internal_buffer(__allocate_long_buffer(__alloc, __str.size())); __alloc_ = std::move(__alloc); } } @@ -2710,7 +2713,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::__ __sec_cp_sz); __buffer.__size_ = __n_copy + __n_add + __sec_cp_sz; traits_type::assign(__buffer.__data_[__buffer.__size_], value_type()); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } // __grow_by is deprecated because it does not set the size. It may not update the size when the size is changed, and it @@ -2746,7 +2749,7 @@ _LIBCPP_DEPRECATED_("use __grow_by_without_replace") basic_string<_CharT, _Trait // This is -1 to make sure the caller sets the size properly, since old versions of this function didn't set the size // at all. __buffer.__size_ = -1; - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } template <class _CharT, class _Traits, class _Allocator> @@ -3394,7 +3397,7 @@ _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocator>::re __long __buffer = __allocate_long_buffer(__alloc_, __requested_capacity); __buffer.__size_ = size(); traits_type::copy(std::__to_address(__buffer.__data_), data(), __buffer.__size_ + 1); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); } template <class _CharT, class _Traits, class _Allocator> @@ -3433,7 +3436,7 @@ inline _LIBCPP_CONSTEXPR_SINCE_CXX20 void basic_string<_CharT, _Traits, _Allocat } traits_type::copy(std::__to_address(__buffer.__data_), std::__to_address(__get_long_pointer()), __size + 1); - __replace_internal_buffer(__buffer); + __reset_internal_buffer(__buffer); # if _LIBCPP_HAS_EXCEPTIONS } catch (...) 
{ return; diff --git a/libcxx/include/tuple b/libcxx/include/tuple index 466f501b5f4f8..3c5330dd6e14e 100644 --- a/libcxx/include/tuple +++ b/libcxx/include/tuple @@ -1443,7 +1443,7 @@ template <class _Tp, class _Tuple, class = enable_if_t<__can_make_from_tuple<_Tp inline _LIBCPP_HIDE_FROM_ABI constexpr _Tp make_from_tuple(_Tuple&& __t) noexcept(noexcept(std::__make_from_tuple_impl<_Tp>(std::forward<_Tuple>(__t), make_index_sequence<tuple_size_v<remove_reference_t<_Tuple>>>()))) { -#if _LIBCPP_STD_VER >= 23 && __has_builtin(__reference_constructs_from_temporary) +#if _LIBCPP_STD_VER >= 23 if constexpr (tuple_size_v<remove_reference_t<_Tuple>> == 1) { static_assert(!std::reference_constructs_from_temporary_v<_Tp, decltype(std::get<0>(std::declval<_Tuple>()))>, "Attempted construction of reference element binds to a temporary whose lifetime has ended"); diff --git a/libcxx/include/typeinfo b/libcxx/include/typeinfo index 24aaabf0a87df..f608c94d3031e 100644 --- a/libcxx/include/typeinfo +++ b/libcxx/include/typeinfo @@ -186,99 +186,99 @@ public: # endif # endif -struct __type_info_implementations { - struct __string_impl_base { - typedef const char* __type_name_t; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static const char* - __type_name_to_string(__type_name_t __v) _NOEXCEPT { - return __v; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static __type_name_t - __string_to_type_name(const char* __v) _NOEXCEPT { - return __v; - } - }; +namespace __type_info_implementations { +struct __string_impl_base { + typedef const char* __type_name_t; + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static const char* + __type_name_to_string(__type_name_t __v) _NOEXCEPT { + return __v; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE _LIBCPP_CONSTEXPR static __type_name_t + __string_to_type_name(const char* __v) _NOEXCEPT { + return __v; + } +}; - struct __unique_impl : __string_impl_base { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { - return reinterpret_cast<size_t>(__v); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __lhs == __rhs; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __lhs < __rhs; - } - }; - - struct __non_unique_impl : __string_impl_base { - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __ptr) _NOEXCEPT { - size_t __hash = 5381; - while (unsigned char __c = static_cast<unsigned char>(*__ptr++)) - __hash = (__hash * 33) ^ __c; - return __hash; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __lhs == __rhs || __builtin_strcmp(__lhs, __rhs) == 0; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - return __builtin_strcmp(__lhs, __rhs) < 0; - } - }; +struct __unique_impl : __string_impl_base { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { + return reinterpret_cast<size_t>(__v); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs == __rhs; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs < __rhs; + } +}; + +struct 
__non_unique_impl : __string_impl_base { + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __ptr) _NOEXCEPT { + size_t __hash = 5381; + while (unsigned char __c = static_cast<unsigned char>(*__ptr++)) + __hash = (__hash * 33) ^ __c; + return __hash; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __lhs == __rhs || __builtin_strcmp(__lhs, __rhs) == 0; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + return __builtin_strcmp(__lhs, __rhs) < 0; + } +}; - struct __non_unique_arm_rtti_bit_impl { - typedef uintptr_t __type_name_t; +struct __non_unique_arm_rtti_bit_impl { + typedef uintptr_t __type_name_t; - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static const char* __type_name_to_string(__type_name_t __v) _NOEXCEPT { - return reinterpret_cast<const char*>(__v & ~__non_unique_rtti_bit::value); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static __type_name_t __string_to_type_name(const char* __v) _NOEXCEPT { - return reinterpret_cast<__type_name_t>(__v); - } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static const char* __type_name_to_string(__type_name_t __v) _NOEXCEPT { + return reinterpret_cast<const char*>(__v & ~__non_unique_rtti_bit::value); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static __type_name_t __string_to_type_name(const char* __v) _NOEXCEPT { + return reinterpret_cast<__type_name_t>(__v); + } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { - if (__is_type_name_unique(__v)) - return __v; - return __non_unique_impl::__hash(__type_name_to_string(__v)); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - if (__lhs == __rhs) - return true; - if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) - // Either both are unique and have a different address, or one of them - // is unique and the other one isn't. In both cases they are unequal. - return false; - return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) == 0; - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { - if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) - return __lhs < __rhs; - return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) < 0; - } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static size_t __hash(__type_name_t __v) _NOEXCEPT { + if (__is_type_name_unique(__v)) + return __v; + return __non_unique_impl::__hash(__type_name_to_string(__v)); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __eq(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + if (__lhs == __rhs) + return true; + if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) + // Either both are unique and have a different address, or one of them + // is unique and the other one isn't. In both cases they are unequal. 
+ return false; + return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) == 0; + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_ALWAYS_INLINE static bool __lt(__type_name_t __lhs, __type_name_t __rhs) _NOEXCEPT { + if (__is_type_name_unique(__lhs) || __is_type_name_unique(__rhs)) + return __lhs < __rhs; + return __builtin_strcmp(__type_name_to_string(__lhs), __type_name_to_string(__rhs)) < 0; + } - private: - // The unique bit is the top bit. It is expected that __type_name_t is 64 bits when - // this implementation is actually used. - typedef integral_constant<__type_name_t, (1ULL << ((__CHAR_BIT__ * sizeof(__type_name_t)) - 1))> - __non_unique_rtti_bit; +private: + // The unique bit is the top bit. It is expected that __type_name_t is 64 bits when + // this implementation is actually used. + typedef integral_constant<__type_name_t, (1ULL << ((__CHAR_BIT__ * sizeof(__type_name_t)) - 1))> + __non_unique_rtti_bit; - _LIBCPP_HIDE_FROM_ABI static bool __is_type_name_unique(__type_name_t __lhs) _NOEXCEPT { - return !(__lhs & __non_unique_rtti_bit::value); - } - }; + _LIBCPP_HIDE_FROM_ABI static bool __is_type_name_unique(__type_name_t __lhs) _NOEXCEPT { + return !(__lhs & __non_unique_rtti_bit::value); + } +}; - typedef +typedef # if _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 1 - __unique_impl + __unique_impl # elif _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 2 - __non_unique_impl + __non_unique_impl # elif _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION == 3 - __non_unique_arm_rtti_bit_impl + __non_unique_arm_rtti_bit_impl # else # error invalid configuration for _LIBCPP_TYPEINFO_COMPARISON_IMPLEMENTATION # endif - __impl; -}; + __impl; +} // namespace __type_info_implementations # if __has_cpp_attribute(_Clang::__ptrauth_vtable_pointer__) # if __has_feature(ptrauth_type_info_vtable_pointer_discrimination) diff --git a/libcxx/modules/std/exception.inc b/libcxx/modules/std/exception.inc index 02b0f80190e5b..3dbc0112c15a0 100644 --- a/libcxx/modules/std/exception.inc +++ b/libcxx/modules/std/exception.inc @@ -18,6 +18,7 @@ export namespace std { using std::rethrow_exception; using std::rethrow_if_nested; using std::set_terminate; + using std::swap; using std::terminate; using std::terminate_handler; using std::throw_with_nested; diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index d047b29b63cc6..8c3e1f0a97dfe 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -245,7 +245,6 @@ deque stdexcept deque tuple deque version exception cstdint -exception cstdlib exception typeinfo exception version execution version diff --git a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp index 897ae89365014..3fac952b9eb98 100644 --- a/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp +++ b/libcxx/test/libcxx/type_traits/no_specializations.verify.cpp @@ -154,14 +154,10 @@ SPECIALIZE_UTT(is_unbounded_array); // expected-error 2 {{cannot be speciali # endif # if TEST_STD_VER >= 23 -SPECIALIZE_UTT(is_implicit_lifetime); // expected-error 2 {{cannot be specialized}} -SPECIALIZE_UTT(is_scoped_enum); // expected-error 2 {{cannot be specialized}} -# if __has_builtin(__reference_constructs_from_temporary) +SPECIALIZE_UTT(is_implicit_lifetime); // expected-error 2 {{cannot be specialized}} +SPECIALIZE_UTT(is_scoped_enum); // expected-error 2 {{cannot be specialized}} 
SPECIALIZE_BTT(reference_constructs_from_temporary); // expected-error 2 {{cannot be specialized}} -# endif -# if __has_builtin(__reference_converts_from_temporary) -SPECIALIZE_BTT(reference_converts_from_temporary); // expected-error 2 {{cannot be specialized}} -# endif +SPECIALIZE_BTT(reference_converts_from_temporary); // expected-error 2 {{cannot be specialized}} # endif # if TEST_STD_VER >= 26 diff --git a/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp b/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp index 13bd761ae9808..602bd1612015d 100644 --- a/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp +++ b/libcxx/test/std/atomics/atomics.types.generic/cas_non_power_of_2.pass.cpp @@ -9,7 +9,7 @@ // https://github.com/llvm/llvm-project/issues/30023 // compare exchange does not work with types of which the size is not a power of 2 -// XFAIL: clang-19, clang-20, clang-21, apple-clang-15, apple-clang-16, apple-clang-17 +// XFAIL: clang-20, clang-21, apple-clang-17 // UNSUPPORTED: c++03 // TODO: remove the UNSUPPORTED clang-22 once libc++ CI's clang is updated to include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp index 731d751df08d9..dc4d8ae2851f4 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/sized_delete_array.pass.cpp @@ -11,7 +11,6 @@ // UNSUPPORTED: c++03, c++11 // These compiler versions and platforms don't enable sized deallocation by default. -// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-17): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp index 64a26ed63e8ce..834c01b2272e2 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/sized_delete.pass.cpp @@ -11,7 +11,6 @@ // UNSUPPORTED: c++03, c++11 // These compiler versions and platforms don't enable sized deallocation by default. 
-// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(apple-clang-17): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp index 0aded33e660d5..7e25d40dc8a7d 100644 --- a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr.pass.cpp @@ -14,7 +14,6 @@ #include <exception> #include <cassert> -#include <type_traits> #include "test_macros.h" diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp new file mode 100644 index 0000000000000..6882bc6548da3 --- /dev/null +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_assignment.pass.cpp @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions, c++03 + +// <exception> + +// typedef unspecified exception_ptr; + +// Test the move assignment of exception_ptr + +#include <exception> +#include <utility> +#include <cassert> + +#include "test_macros.h" + +int main(int, char**) { + std::exception_ptr p = std::make_exception_ptr(42); + std::exception_ptr p2{p}; + assert(p2 == p); + // Under test: the move assignment + std::exception_ptr p3; + p3 = std::move(p2); + assert(p3 == p); +// `p2` was moved from. In libc++ it will be nullptr, but +// this is not guaranteed by the standard. +#if defined(_LIBCPP_VERSION) && !defined(_LIBCPP_ABI_MICROSOFT) + assert(p2 == nullptr); + assert(p2 == nullptr); +#endif + + try { + std::rethrow_exception(p3); + } catch (int e) { + assert(e == 42); + } + + return 0; +} diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp new file mode 100644 index 0000000000000..122e229fd6e47 --- /dev/null +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_move_ctr.pass.cpp @@ -0,0 +1,43 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions, c++03 + +// <exception> + +// typedef unspecified exception_ptr; + +// Test the move constructor of exception_ptr + +#include <exception> +#include <utility> +#include <cassert> + +#include "test_macros.h" + +int main(int, char**) { + std::exception_ptr p = std::make_exception_ptr(42); + std::exception_ptr p2{p}; + assert(p2 == p); + // Under test: The move constructor + std::exception_ptr p3{std::move(p2)}; + assert(p3 == p); +// `p2` was moved from. In libc++ it will be nullptr, but +// this is not guaranteed by the standard. +#if defined(_LIBCPP_VERSION) && !defined(_LIBCPP_ABI_MICROSOFT) + assert(p2 == nullptr); +#endif + + try { + std::rethrow_exception(p3); + } catch (int e) { + assert(e == 42); + } + + return 0; +} diff --git a/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp new file mode 100644 index 0000000000000..82b4713bed538 --- /dev/null +++ b/libcxx/test/std/language.support/support.exception/propagation/exception_ptr_swap.pass.cpp @@ -0,0 +1,40 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions + +// <exception> + +// typedef unspecified exception_ptr; + +// Test swapping of exception_ptr + +#include <exception> +#include <utility> +#include <cassert> + +#include "test_macros.h" + +int main(int, char**) { + std::exception_ptr p21 = std::make_exception_ptr(42); + std::exception_ptr p42 = std::make_exception_ptr(21); + std::swap(p42, p21); + + try { + std::rethrow_exception(p21); + } catch (int e) { + assert(e == 21); + } + try { + std::rethrow_exception(p42); + } catch (int e) { + assert(e == 42); + } + + return 0; +} diff --git a/libcxx/test/std/numerics/c.math/signbit.pass.cpp b/libcxx/test/std/numerics/c.math/signbit.pass.cpp index 7571ced2e4431..233e8ed2338b6 100644 --- a/libcxx/test/std/numerics/c.math/signbit.pass.cpp +++ b/libcxx/test/std/numerics/c.math/signbit.pass.cpp @@ -12,7 +12,7 @@ // UNSUPPORTED: windows // These compilers don't support constexpr `__builtin_signbit` yet. 
-// UNSUPPORTED: clang-19, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // GCC warns about signbit comparing `bool_v < 0`, which we're testing // ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-bool-compare diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp index 6bd112c7d1280..f49e19acf0234 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp index bdfc57694dd53..0789213163847 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp index 1fe7916c67823..f09bf30771102 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp index b797ae7533add..86e2e61647be8 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class R, class T> diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp index 8b6188f1fad0e..c2be8c5a47bdf 100644 --- a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++26 -// The test uses "Placeholder variables with no name" -// UNSUPPORTED: apple-clang-16 - // <numeric> // template<class T> diff --git a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp index f71f688afc52e..bb0b2d322218d 100644 --- a/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp +++ b/libcxx/test/std/utilities/memory/util.smartptr/util.smartptr.shared/util.smartptr.shared.create/shared_ptr_array.pass.cpp @@ -10,7 +10,6 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // These compiler versions and platforms don't enable sized deallocation by default. 
-// ADDITIONAL_COMPILE_FLAGS(apple-clang-16): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=x86_64-w64-windows-gnu): -fsized-deallocation // ADDITIONAL_COMPILE_FLAGS(target=i686-w64-windows-gnu): -fsized-deallocation diff --git a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp index 2e0b1e9025a61..b6b226c7f79d8 100644 --- a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 // These compilers don't support __builtin_is_virtual_base_of yet. -// UNSUPPORTED: clang-19, gcc-14, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // <type_traits> diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp index 1ca9d44b82afe..f43693c08bc39 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.verify.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // These compilers don't support __builtin_is_implicit_lifetime yet. -// UNSUPPORTED: clang-19, gcc-14, apple-clang-16, apple-clang-17 +// UNSUPPORTED: apple-clang-17 // <type_traits> diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp index ad53c8176cc92..84fe7cfb02208 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_constructs_from_temporary.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++23 -// These compilers don't support std::reference_converts_from_temporary yet. -// UNSUPPORTED: apple-clang-16, clang-19.1 - // <type_traits> // template<class T, class U> struct reference_constructs_from_temporary; diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp index 73cc4f3e29d5a..8319d9e1563fe 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/reference_converts_from_temporary.pass.cpp @@ -8,9 +8,6 @@ // REQUIRES: std-at-least-c++23 -// These compilers don't support std::reference_converts_from_temporary yet. -// UNSUPPORTED: apple-clang-16, clang-19.1 - // <type_traits> // template<class T, class U> struct reference_converts_from_temporary; diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp index 2dfbae9138864..12d778408d5ec 100644 --- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp +++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.apply/make_from_tuple.verify.cpp @@ -19,11 +19,7 @@ #include "test_macros.h" void test() { - // FreeBSD ci use clang 19.1.1, which hasn't implement __reference_constructs_from_temporary. 
- // The static_assert inner std::make_from_tuple will not triggered. -#if __has_builtin(__reference_constructs_from_temporary) // expected-error@*:* {{static assertion failed}} -#endif // Turns to an error since C++26 (Disallow Binding a Returned Glvalue to a Temporary https://wg21.link/P2748R5). #if TEST_STD_VER >= 26 diff --git a/lld/test/wasm/runtime-relocations-himem.s b/lld/test/wasm/runtime-relocations-himem.s new file mode 100644 index 0000000000000..a12a93a6cb933 --- /dev/null +++ b/lld/test/wasm/runtime-relocations-himem.s @@ -0,0 +1,60 @@ +## Verifies runtime relocation code for addresses over 2gb works correctly. +## We have had issues with LEB encoding of address over 2gb in i32.const +## instruction leading to invalid binaries. + +# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s +# RUN: wasm-ld --global-base=2147483648 --experimental-pic --unresolved-symbols=import-dynamic -no-gc-sections --shared-memory --no-entry -o %t.wasm %t.o +# XUN: obj2yaml %t.wasm | FileCheck %s +# RUN: llvm-objdump -d --no-show-raw-insn --no-leading-addr %t.wasm | FileCheck %s -- + +.globl tls_sym +.globl data_sym +.globl _start +.globaltype __tls_base, i32 + +_start: + .functype _start () -> () + global.get __tls_base + i32.const tls_sym@TLSREL + i32.add + drop + i32.const data_sym + drop + end_function + +.section tls_sec,"T",@ +.p2align 2 +tls_sym: + .int32 0 + .int32 extern_sym + .size tls_sym, 8 + +.section data_sec,"",@ +.p2align 2 +data_sym: + .int32 0 + .int32 extern_sym + .size data_sym, 8 + +.section .custom_section.target_features,"",@ + .int8 2 + .int8 43 + .int8 7 + .ascii "atomics" + .int8 43 + .int8 11 + .ascii "bulk-memory" + +# CHECK: <__wasm_apply_data_relocs>: +# CHECK-EMPTY: +# CHECK-NEXT: i32.const -2147483636 +# CHECK-NEXT: global.get 0 +# CHECK-NEXT: i32.store 0 +# CHECK-NEXT: end + +# CHECK: <__wasm_apply_tls_relocs>: +# CHECK-EMPTY: +# CHECK-NEXT: i32.const -2147483644 +# CHECK-NEXT: global.get 0 +# CHECK-NEXT: i32.store 0 +# CHECK-NEXT: end diff --git a/lld/test/wasm/stack-first.test b/lld/test/wasm/stack-first.test index 72e1a006d5700..46e8c6ebc2381 100644 --- a/lld/test/wasm/stack-first.test +++ b/lld/test/wasm/stack-first.test @@ -8,6 +8,15 @@ RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %p/Inputs/stack-first. 
RUN: wasm-ld -z stack-size=512 --stack-first --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o RUN: obj2yaml %t.wasm | FileCheck %s +; Check `--no-stack-first` +RUN: wasm-ld -z stack-size=512 --stack-first --no-stack-first --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o +RUN: obj2yaml %t.wasm | FileCheck %s --check-prefix=NOT-FIRST + +; Check that the default is no-stack-first +RUN: wasm-ld -z stack-size=512 --export=__data_end --export=__heap_base --export=someByte -o %t.wasm %t.o +RUN: obj2yaml %t.wasm | FileCheck %s --check-prefix=NOT-FIRST + + CHECK: - Type: GLOBAL CHECK-NEXT: Globals: CHECK-NEXT: - Index: 0 @@ -51,3 +60,19 @@ CHECK-NEXT: Index: 2 CHECK-NEXT: - Name: __heap_base CHECK-NEXT: Kind: GLOBAL CHECK-NEXT: Index: 3 + +NOT-FIRST: - Type: GLOBAL +NOT-FIRST-NEXT: Globals: +NOT-FIRST-NEXT: - Index: 0 +NOT-FIRST-NEXT: Type: I32 +NOT-FIRST-NEXT: Mutable: true +NOT-FIRST-NEXT: InitExpr: +NOT-FIRST-NEXT: Opcode: I32_CONST +NOT-FIRST-NEXT: Value: 1552 +NOT-FIRST-NEXT: - Index: 1 +NOT-FIRST-NEXT: Type: I32 +NOT-FIRST-NEXT: Mutable: false +NOT-FIRST-NEXT: InitExpr: +NOT-FIRST-NEXT: Opcode: I32_CONST +NOT-FIRST-NEXT: Value: 1024 + diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 9c0e1b58e62f9..aad8095881bba 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -595,7 +595,7 @@ static void readConfigs(opt::InputArgList &args) { ctx.arg.shlibSigCheck = !args.hasArg(OPT_no_shlib_sigcheck); ctx.arg.stripAll = args.hasArg(OPT_strip_all); ctx.arg.stripDebug = args.hasArg(OPT_strip_debug); - ctx.arg.stackFirst = args.hasArg(OPT_stack_first); + ctx.arg.stackFirst = args.hasFlag(OPT_stack_first, OPT_no_stack_first, false); ctx.arg.trace = args.hasArg(OPT_trace); ctx.arg.thinLTOCacheDir = args.getLastArgValue(OPT_thinlto_cache_dir); ctx.arg.thinLTOCachePolicy = CHECK( diff --git a/lld/wasm/InputChunks.cpp b/lld/wasm/InputChunks.cpp index 44927e7a432bc..14e02e6009318 100644 --- a/lld/wasm/InputChunks.cpp +++ b/lld/wasm/InputChunks.cpp @@ -423,8 +423,6 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { bool is64 = ctx.arg.is64.value_or(false); bool generated = false; - unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST - : WASM_OPCODE_I32_CONST; unsigned opcode_ptr_add = is64 ? WASM_OPCODE_I64_ADD : WASM_OPCODE_I32_ADD; @@ -451,8 +449,7 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { << " output offset=" << offset << "\n"); // Calculate the address at which to apply the relocation - writeU8(os, opcode_ptr_const, "CONST"); - writeSleb128(os, offset, "offset"); + writePtrConst(os, offset, is64, "offset"); // In PIC mode we need to add the __memory_base if (ctx.isPic) { @@ -466,8 +463,6 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { // Now figure out what we want to store at this location bool is64 = relocIs64(rel.Type); - unsigned opcode_reloc_const = - is64 ? WASM_OPCODE_I64_CONST : WASM_OPCODE_I32_CONST; unsigned opcode_reloc_add = is64 ? 
WASM_OPCODE_I64_ADD : WASM_OPCODE_I32_ADD; unsigned opcode_reloc_store = @@ -477,8 +472,7 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET"); writeUleb128(os, sym->getGOTIndex(), "global index"); if (rel.Addend) { - writeU8(os, opcode_reloc_const, "CONST"); - writeSleb128(os, rel.Addend, "addend"); + writePtrConst(os, rel.Addend, is64, "addend"); writeU8(os, opcode_reloc_add, "ADD"); } } else { @@ -491,8 +485,8 @@ bool InputChunk::generateRelocationCode(raw_ostream &os) const { baseSymbol = ctx.sym.tlsBase; writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET"); writeUleb128(os, baseSymbol->getGlobalIndex(), "base"); - writeU8(os, opcode_reloc_const, "CONST"); - writeSleb128(os, file->calcNewValue(rel, tombstone, this), "offset"); + writePtrConst(os, file->calcNewValue(rel, tombstone, this), is64, + "offset"); writeU8(os, opcode_reloc_add, "ADD"); } diff --git a/lld/wasm/Options.td b/lld/wasm/Options.td index 2f699e2f68350..0ed69258d99aa 100644 --- a/lld/wasm/Options.td +++ b/lld/wasm/Options.td @@ -250,8 +250,9 @@ def no_entry: FF<"no-entry">, def no_shlib_sigcheck: FF<"no-shlib-sigcheck">, HelpText<"Do not check signatures of functions defined in shared libraries.">; -def stack_first: FF<"stack-first">, - HelpText<"Place stack at start of linear memory rather than after data">; +defm stack_first: B<"stack-first", + "Place stack at start of linear memory", + "Place the stack after static data region (default)">; def table_base: JJ<"table-base=">, HelpText<"Table offset at which to place address taken functions (Defaults to 1)">; diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp index e1192706ea913..399a5084e6595 100644 --- a/lld/wasm/SyntheticSections.cpp +++ b/lld/wasm/SyntheticSections.cpp @@ -434,8 +434,6 @@ void GlobalSection::addInternalGOTEntry(Symbol *sym) { void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const { assert(!ctx.arg.extendedConst); bool is64 = ctx.arg.is64.value_or(false); - unsigned opcode_ptr_const = is64 ? WASM_OPCODE_I64_CONST : WASM_OPCODE_I32_CONST; unsigned opcode_ptr_add = is64 ?
WASM_OPCODE_I64_ADD : WASM_OPCODE_I32_ADD; @@ -452,8 +450,7 @@ void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const { writeUleb128(os, ctx.sym.memoryBase->getGlobalIndex(), "__memory_base"); // Add the virtual address of the data symbol - writeU8(os, opcode_ptr_const, "CONST"); - writeSleb128(os, d->getVA(), "offset"); + writePtrConst(os, d->getVA(), is64, "offset"); } else if (auto *f = dyn_cast<FunctionSymbol>(sym)) { if (f->isStub) continue; @@ -462,8 +459,7 @@ void GlobalSection::generateRelocationCode(raw_ostream &os, bool TLS) const { writeUleb128(os, ctx.sym.tableBase->getGlobalIndex(), "__table_base"); // Add the table index to __table_base - writeU8(os, opcode_ptr_const, "CONST"); - writeSleb128(os, f->getTableIndex(), "offset"); + writePtrConst(os, f->getTableIndex(), is64, "offset"); } else { assert(isa<UndefinedData>(sym) || isa<SharedData>(sym)); continue; diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt index e3b72e94d4beb..01b5546fee00d 100644 --- a/lldb/CMakeLists.txt +++ b/lldb/CMakeLists.txt @@ -87,6 +87,12 @@ if (LLDB_ENABLE_PYTHON) set(LLDB_PYTHON_EXT_SUFFIX "_d${LLDB_PYTHON_EXT_SUFFIX}") endif() endif() + if(TARGET Python3::Python) + get_target_property(_Python3_LIB_PATH Python3::Python IMPORTED_LIBRARY_LOCATION) + if(_Python3_LIB_PATH) + get_filename_component(LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME "${_Python3_LIB_PATH}" NAME) + endif() + endif() endif () if (LLDB_ENABLE_LUA) diff --git a/lldb/include/lldb/Target/InstrumentationRuntime.h b/lldb/include/lldb/Target/InstrumentationRuntime.h index a6121c24b9560..d2499528e97ab 100644 --- a/lldb/include/lldb/Target/InstrumentationRuntime.h +++ b/lldb/include/lldb/Target/InstrumentationRuntime.h @@ -73,6 +73,13 @@ class InstrumentationRuntime /// is guaranteed to be loaded. virtual void Activate() = 0; + /// \return true if `CheckIfRuntimeIsValid` should be called on all modules. + /// In this case the return value of `GetPatternForRuntimeLibrary` will be + /// ignored. Return false if `CheckIfRuntimeIsValid` should only be called + /// for modules whose name matches `GetPatternForRuntimeLibrary`. + /// + virtual bool MatchAllModules() { return false; } + public: static void ModulesDidLoad(lldb_private::ModuleList &module_list, Process *process, diff --git a/lldb/include/lldb/Utility/Stream.h b/lldb/include/lldb/Utility/Stream.h index 82774d56922a9..13455552131da 100644 --- a/lldb/include/lldb/Utility/Stream.h +++ b/lldb/include/lldb/Utility/Stream.h @@ -300,6 +300,12 @@ class Stream { /// The current indentation level. unsigned GetIndentLevel() const; + /// Set the current indentation level. + /// + /// \param[in] level + /// The new indentation level. + void SetIndentLevel(unsigned level); + /// Indent the current line in the stream. /// /// Indent the current line using the current indentation level and print an @@ -315,6 +321,20 @@ class Stream { /// Increment the current indentation level. void IndentMore(unsigned amount = 2); + struct IndentScope { + IndentScope(Stream &stream) + : m_stream(stream), m_original_indent_level(stream.GetIndentLevel()) {} + ~IndentScope() { m_stream.SetIndentLevel(m_original_indent_level); } + + private: + Stream &m_stream; + unsigned m_original_indent_level; + }; + + /// Create an indentation scope that restores the original indent level when + /// the object goes out of scope (RAII). + IndentScope MakeIndentScope(unsigned indent_amount = 2); + /// Output an offset value. 
/// /// Put an offset \a uval out to the stream using the printf format in \a @@ -364,12 +384,6 @@ class Stream { /// address and pointer values. void SetAddressByteSize(uint32_t addr_size); - /// Set the current indentation level. - /// - /// \param[in] level - /// The new indentation level. - void SetIndentLevel(unsigned level); - /// Output a SLEB128 number to the stream. /// /// Put an SLEB128 \a uval out to the stream using the printf format in \a diff --git a/lldb/packages/Python/lldbsuite/test/builders/builder.py b/lldb/packages/Python/lldbsuite/test/builders/builder.py index 96c7b3987d8a1..024c9f1c7e435 100644 --- a/lldb/packages/Python/lldbsuite/test/builders/builder.py +++ b/lldb/packages/Python/lldbsuite/test/builders/builder.py @@ -258,6 +258,7 @@ def _getDebugInfoArgs(self, debug_info): "gmodules": {"MAKE_DSYM": "NO", "MAKE_GMODULES": "YES"}, "debug_names": {"MAKE_DEBUG_NAMES": "YES"}, "dwp": {"MAKE_DSYM": "NO", "MAKE_DWP": "YES"}, + "pdb": {"MAKE_PDB": "YES"}, } # Collect all flags, with later options overriding earlier ones diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index b92de941c4124..8c1eea97620e2 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -1791,6 +1791,11 @@ def no_reason(_): if can_replicate ] + # PDB is off by default, because it has a lot of failures right now. + # See llvm.org/pr149498 + if original_testcase.TEST_WITH_PDB_DEBUG_INFO: + dbginfo_categories.append("pdb") + xfail_for_debug_info_cat_fn = getattr( attrvalue, "__xfail_for_debug_info_cat_fn__", no_reason ) @@ -1878,6 +1883,13 @@ class TestBase(Base, metaclass=LLDBTestCaseFactory): # test multiple times with various debug info types. NO_DEBUG_INFO_TESTCASE = False + TEST_WITH_PDB_DEBUG_INFO = False + """ + Subclasses can set this to True to test with PDB in addition to the other debug info + types. This is off by default because many tests will fail due to missing functionality in PDB. + See llvm.org/pr149498. + """ + def generateSource(self, source): template = source + ".template" temp = os.path.join(self.getSourceDir(), template) diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index 28cae54776ac8..b3822db162a0b 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -249,6 +249,10 @@ ifeq ($(CC_TYPE), clang) MODULE_DEBUG_INFO_FLAGS += -gmodules endif +ifeq "$(MAKE_PDB)" "YES" + DEBUG_INFO_FLAG ?= -g -gcodeview +endif + # If the OS is Windows, we need to pass -gdwarf to clang, otherwise it will build # with codeview by default but all the tests rely on dwarf.
ifeq "$(OS)" "Windows_NT" diff --git a/lldb/packages/Python/lldbsuite/test/test_categories.py b/lldb/packages/Python/lldbsuite/test/test_categories.py index 1f6e8a78e0c0d..b8a764fb3349a 100644 --- a/lldb/packages/Python/lldbsuite/test/test_categories.py +++ b/lldb/packages/Python/lldbsuite/test/test_categories.py @@ -12,7 +12,13 @@ # Key: Category name # Value: should be used in lldbtest's debug-info replication -debug_info_categories = {"dwarf": True, "dwo": True, "dsym": True, "gmodules": False} +debug_info_categories = { + "dwarf": True, + "dwo": True, + "dsym": True, + "pdb": False, + "gmodules": False, +} all_categories = { "basic_process": "Basic process execution sniff tests.", @@ -34,6 +40,7 @@ "lldb-dap": "Tests for the Debug Adapter Protocol with lldb-dap", "llgs": "Tests for the gdb-server functionality of lldb-server", "msvcstl": "Test for MSVC STL data formatters", + "pdb": "Tests that can be run with PDB debug information", "pexpect": "Tests requiring the pexpect library to be available", "objc": "Tests related to the Objective-C programming language support", "pyapi": "Tests related to the Python API", @@ -65,6 +72,8 @@ def is_supported_on_platform(category, platform, compiler_path): if platform not in ["darwin", "macosx", "ios", "watchos", "tvos", "bridgeos"]: return False return gmodules.is_compiler_clang_with_gmodules(compiler_path) + elif category == "pdb": + return platform == "windows" return True diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index d892c01f0bc71..ac550962cfb85 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -32,6 +32,10 @@ # timeout by a factor of 10 if ASAN is enabled. DEFAULT_TIMEOUT = 10 * (10 if ("ASAN_OPTIONS" in os.environ) else 1) +# See lldbtest.Base.spawnSubprocess, which should help ensure any processes +# created by the DAP client are terminated correctly when the test ends. +SpawnHelperCallback = Callable[[str, List[str], List[str]], subprocess.Popen] + ## DAP type references @@ -191,14 +195,16 @@ def __init__( self, recv: BinaryIO, send: BinaryIO, - init_commands: list[str], - log_file: Optional[TextIO] = None, + init_commands: Optional[List[str]] = None, + log_file: Optional[str] = None, + spawn_helper: Optional[SpawnHelperCallback] = None, ): # For debugging test failures, try setting `trace_file = sys.stderr`. self.trace_file: Optional[TextIO] = None self.log_file = log_file self.send = send self.recv = recv + self.spawn_helper = spawn_helper # Packets that have been received and processed but have not yet been # requested by a test case. @@ -211,7 +217,7 @@ def __init__( self._recv_thread = threading.Thread(target=self._read_packet_thread) # session state - self.init_commands = init_commands + self.init_commands = init_commands if init_commands else [] self.exit_status: Optional[int] = None self.capabilities: Dict = {} self.initialized: bool = False @@ -310,11 +316,6 @@ def collect_output( output += self.get_output(category, clear=clear) return output - def _enqueue_recv_packet(self, packet: Optional[ProtocolMessage]): - with self.recv_condition: - self.recv_packets.append(packet) - self.recv_condition.notify() - def _handle_recv_packet(self, packet: Optional[ProtocolMessage]) -> bool: """Handles an incoming packet. 
@@ -460,22 +461,11 @@ def _handle_reverse_request(self, request: Request) -> None: self.reverse_requests.append(request) arguments = request.get("arguments") if request["command"] == "runInTerminal" and arguments is not None: - in_shell = arguments.get("argsCanBeInterpretedByShell", False) - print("spawning...", arguments["args"]) - proc = subprocess.Popen( - arguments["args"], - env=arguments.get("env", {}), - cwd=arguments.get("cwd", None), - stdin=subprocess.DEVNULL, - stdout=sys.stderr, - stderr=sys.stderr, - shell=in_shell, - ) - body = {} - if in_shell: - body["shellProcessId"] = proc.pid - else: - body["processId"] = proc.pid + assert self.spawn_helper is not None, "Not configured to spawn subprocesses" + [exe, *args] = arguments["args"] + env = [f"{k}={v}" for k, v in arguments.get("env", {}).items()] + proc = self.spawn_helper(exe, args, env) + body = {"processId": proc.pid} self.send_packet( { "type": "response", @@ -1501,12 +1491,14 @@ def request_setInstructionBreakpoints(self, memory_reference=[]): class DebugAdapterServer(DebugCommunication): def __init__( self, + *, executable: Optional[str] = None, connection: Optional[str] = None, - init_commands: list[str] = [], - log_file: Optional[TextIO] = None, - env: Optional[dict[str, str]] = None, - additional_args: list[str] = [], + init_commands: Optional[list[str]] = None, + log_file: Optional[str] = None, + env: Optional[Dict[str, str]] = None, + additional_args: Optional[List[str]] = None, + spawn_helper: Optional[SpawnHelperCallback] = None, ): self.process = None self.connection = None @@ -1532,13 +1524,21 @@ def __init__( s = socket.create_connection((host.strip("[]"), int(port))) else: raise ValueError("invalid connection: {}".format(connection)) - DebugCommunication.__init__( - self, s.makefile("rb"), s.makefile("wb"), init_commands, log_file + super().__init__( + s.makefile("rb"), + s.makefile("wb"), + init_commands, + log_file, + spawn_helper, ) self.connection = connection else: - DebugCommunication.__init__( - self, self.process.stdout, self.process.stdin, init_commands, log_file + super().__init__( + self.process.stdout, + self.process.stdin, + init_commands, + log_file, + spawn_helper, ) @classmethod @@ -1546,14 +1546,14 @@ def launch( cls, *, executable: str, - env: Optional[dict[str, str]] = None, - log_file: Optional[TextIO] = None, + env: Optional[Dict[str, str]] = None, + log_file: Optional[str] = None, connection: Optional[str] = None, connection_timeout: Optional[int] = None, - additional_args: list[str] = [], + additional_args: Optional[List[str]] = None, ) -> tuple[subprocess.Popen, Optional[str]]: adapter_env = os.environ.copy() - if env is not None: + if env: adapter_env.update(env) if log_file: @@ -1561,7 +1561,8 @@ def launch( args = [executable] # Add additional arguments first (like --no-lldbinit) - args.extend(additional_args) + if additional_args: + args.extend(additional_args) if connection is not None: args.append("--connection") diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py index c6c4a3e2a4e1e..71ca60ebe8d34 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/lldbdap_testcase.py @@ -39,6 +39,7 @@ def create_debug_adapter( log_file=log_file_path, env=lldbDAPEnv, additional_args=additional_args or [], + spawn_helper=self.spawnSubprocess, ) def build_and_create_debug_adapter( diff --git 
a/lldb/source/Commands/CommandObjectFrame.cpp b/lldb/source/Commands/CommandObjectFrame.cpp index 88a02dce35b9d..9133359fbf537 100644 --- a/lldb/source/Commands/CommandObjectFrame.cpp +++ b/lldb/source/Commands/CommandObjectFrame.cpp @@ -265,6 +265,29 @@ class CommandObjectFrameSelect : public CommandObjectParsed { Options *GetOptions() override { return &m_options; } +private: + void SkipHiddenFrames(Thread &thread, uint32_t frame_idx) { + uint32_t candidate_idx = frame_idx; + const unsigned max_depth = 12; + for (unsigned num_try = 0; num_try < max_depth; ++num_try) { + if (candidate_idx == 0 && *m_options.relative_frame_offset == -1) { + candidate_idx = UINT32_MAX; + break; + } + candidate_idx += *m_options.relative_frame_offset; + if (auto candidate_sp = thread.GetStackFrameAtIndex(candidate_idx)) { + if (candidate_sp->IsHidden()) + continue; + // Now candidate_idx is the first non-hidden frame. + break; + } + candidate_idx = UINT32_MAX; + break; + }; + if (candidate_idx != UINT32_MAX) + m_options.relative_frame_offset = candidate_idx - frame_idx; + } + protected: void DoExecute(Args &command, CommandReturnObject &result) override { // No need to check "thread" for validity as eCommandRequiresThread ensures @@ -278,28 +301,13 @@ class CommandObjectFrameSelect : public CommandObjectParsed { if (frame_idx == UINT32_MAX) frame_idx = 0; - // If moving up/down by one, skip over hidden frames. - if (*m_options.relative_frame_offset == 1 || - *m_options.relative_frame_offset == -1) { - uint32_t candidate_idx = frame_idx; - const unsigned max_depth = 12; - for (unsigned num_try = 0; num_try < max_depth; ++num_try) { - if (candidate_idx == 0 && *m_options.relative_frame_offset == -1) { - candidate_idx = UINT32_MAX; - break; - } - candidate_idx += *m_options.relative_frame_offset; - if (auto candidate_sp = thread->GetStackFrameAtIndex(candidate_idx)) { - if (candidate_sp->IsHidden()) - continue; - // Now candidate_idx is the first non-hidden frame. - break; - } - candidate_idx = UINT32_MAX; - break; - }; - if (candidate_idx != UINT32_MAX) - m_options.relative_frame_offset = candidate_idx - frame_idx; + // If moving up/down by one, skip over hidden frames, unless we started + // in a hidden frame. + if ((*m_options.relative_frame_offset == 1 || + *m_options.relative_frame_offset == -1)) { + if (auto current_frame_sp = thread->GetStackFrameAtIndex(frame_idx); + !current_frame_sp->IsHidden()) + SkipHiddenFrames(*thread, frame_idx); } if (*m_options.relative_frame_offset < 0) { diff --git a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp index ff37b48d86ca8..a5547a4699ca9 100644 --- a/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp +++ b/lldb/source/Plugins/ABI/RISCV/ABISysV_riscv.cpp @@ -798,6 +798,8 @@ bool ABISysV_riscv::RegisterIsCalleeSaved(const RegisterInfo *reg_info) { .Cases({"f8", "f9", "f18", "f19", "f20", "f21", "f22", "f23"}, is_hw_fp) .Cases({"f24", "f25", "f26", "f27"}, is_hw_fp) + // vlenb is constant and needed for vector unwinding. 
+ .Case("vlenb", true) .Default(false); return is_callee_saved; diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt index 1717b0a896669..727c8290bceb4 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CMakeLists.txt @@ -1,10 +1,13 @@ add_lldb_library(lldbPluginCPPRuntime CPPLanguageRuntime.cpp + VerboseTrapFrameRecognizer.cpp LINK_LIBS lldbCore lldbSymbol lldbTarget + CLANG_LIBS + clangCodeGen ) add_subdirectory(ItaniumABI) diff --git a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp index 21a5ebe53073a..913678b629f2f 100644 --- a/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/CPPLanguageRuntime.cpp @@ -12,6 +12,7 @@ #include <memory> #include "CPPLanguageRuntime.h" +#include "VerboseTrapFrameRecognizer.h" #include "llvm/ADT/StringRef.h" @@ -107,12 +108,15 @@ class LibCXXFrameRecognizer : public StackFrameRecognizer { CPPLanguageRuntime::CPPLanguageRuntime(Process *process) : LanguageRuntime(process) { - if (process) + if (process) { process->GetTarget().GetFrameRecognizerManager().AddRecognizer( StackFrameRecognizerSP(new LibCXXFrameRecognizer()), {}, std::make_shared<RegularExpression>("^std::__[^:]*::"), /*mangling_preference=*/Mangled::ePreferDemangledWithoutArguments, /*first_instruction_only=*/false); + + RegisterVerboseTrapFrameRecognizer(*process); + } } bool CPPLanguageRuntime::IsAllowedRuntimeValue(ConstString name) { diff --git a/lldb/source/Target/VerboseTrapFrameRecognizer.cpp b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp similarity index 81% rename from lldb/source/Target/VerboseTrapFrameRecognizer.cpp rename to lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp index 03ab58b8c59a9..2b6bf2cd470e6 100644 --- a/lldb/source/Target/VerboseTrapFrameRecognizer.cpp +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.cpp @@ -1,4 +1,4 @@ -#include "lldb/Target/VerboseTrapFrameRecognizer.h" +#include "VerboseTrapFrameRecognizer.h" #include "lldb/Core/Module.h" #include "lldb/Symbol/Function.h" @@ -95,33 +95,14 @@ VerboseTrapFrameRecognizer::RecognizeFrame(lldb::StackFrameSP frame_sp) { if (func_name.empty()) return {}; - static auto trap_regex = - llvm::Regex(llvm::formatv("^{0}\\$(.*)\\$(.*)$", ClangTrapPrefix).str()); - SmallVector<llvm::StringRef, 3> matches; - std::string regex_err_msg; - if (!trap_regex.match(func_name, &matches, ®ex_err_msg)) { - LLDB_LOGF(GetLog(LLDBLog::Unwind), - "Failed to parse match trap regex for '%s': %s", func_name.data(), - regex_err_msg.c_str()); - - return {}; - } - - // For `__clang_trap_msg$category$message$` we expect 3 matches: - // 1. entire string - // 2. category - // 3. message - if (matches.size() != 3) { - LLDB_LOGF(GetLog(LLDBLog::Unwind), - "Unexpected function name format. 
Expected '<trap prefix>$<trap " - "category>$<trap message>'$ but got: '%s'.", - func_name.data()); - + auto maybe_trap_reason = + clang::CodeGen::DemangleTrapReasonInDebugInfo(func_name); + if (!maybe_trap_reason.has_value()) { + LLDB_LOGF(GetLog(LLDBLog::Unwind), "Failed to demangle '%s' as trap reason", + func_name.str().c_str()); return {}; } - - auto category = matches[1]; - auto message = matches[2]; + auto [category, message] = maybe_trap_reason.value(); std::string stop_reason = category.empty() ? "<empty category>" : category.str(); diff --git a/lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h similarity index 63% rename from lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h rename to lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h index 7e045760a28be..7d7020f63c8d2 100644 --- a/lldb/include/lldb/Target/VerboseTrapFrameRecognizer.h +++ b/lldb/source/Plugins/LanguageRuntime/CPlusPlus/VerboseTrapFrameRecognizer.h @@ -1,5 +1,13 @@ -#ifndef LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H -#define LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H +//===-- VerboseTrapFrameRecognizer.h --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H +#define LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H #include "lldb/Target/StackFrameRecognizer.h" @@ -36,4 +44,4 @@ class VerboseTrapFrameRecognizer : public StackFrameRecognizer { } // namespace lldb_private -#endif // LLDB_TARGET_VERBOSETRAPFRAMERECOGNIZER_H +#endif // LLDB_SOURCE_PLUGINS_LANGUAGERUNTIME_C_PLUS_PLUS_VERBOSETRAPFRAMERECOGNIZER_H diff --git a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp index 49841e7307443..e06e69fb08305 100644 --- a/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp +++ b/lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp @@ -2735,9 +2735,8 @@ static void ApplyELF64ABS64Relocation(Symtab *symtab, ELFRelocation &rel, // ObjectFileELF creates a WritableDataBuffer in CreateInstance. WritableDataBuffer *data_buffer = llvm::cast<WritableDataBuffer>(data_buffer_sp.get()); - uint64_t *dst = reinterpret_cast<uint64_t *>( - data_buffer->GetBytes() + rel_section->GetFileOffset() + - ELFRelocation::RelocOffset64(rel)); + void *const dst = data_buffer->GetBytes() + rel_section->GetFileOffset() + + ELFRelocation::RelocOffset64(rel); uint64_t val_offset = value + ELFRelocation::RelocAddend64(rel); memcpy(dst, &val_offset, sizeof(uint64_t)); } @@ -2762,9 +2761,8 @@ static void ApplyELF64ABS32Relocation(Symtab *symtab, ELFRelocation &rel, // ObjectFileELF creates a WritableDataBuffer in CreateInstance. 
WritableDataBuffer *data_buffer = llvm::cast<WritableDataBuffer>(data_buffer_sp.get()); - uint32_t *dst = reinterpret_cast<uint32_t *>( - data_buffer->GetBytes() + rel_section->GetFileOffset() + - ELFRelocation::RelocOffset32(rel)); + void *const dst = data_buffer->GetBytes() + rel_section->GetFileOffset() + + ELFRelocation::RelocOffset32(rel); memcpy(dst, &truncated_addr, sizeof(uint32_t)); } } diff --git a/lldb/source/Target/CMakeLists.txt b/lldb/source/Target/CMakeLists.txt index b7788e80eecac..8e6d51efad1f3 100644 --- a/lldb/source/Target/CMakeLists.txt +++ b/lldb/source/Target/CMakeLists.txt @@ -80,7 +80,6 @@ add_lldb_library(lldbTarget UnixSignals.cpp UnwindAssembly.cpp UnwindLLDB.cpp - VerboseTrapFrameRecognizer.cpp ADDITIONAL_HEADER_DIRS ${LLDB_INCLUDE_DIR}/lldb/Target diff --git a/lldb/source/Target/InstrumentationRuntime.cpp b/lldb/source/Target/InstrumentationRuntime.cpp index 7e58e8bf26cb1..d9800a8541f4e 100644 --- a/lldb/source/Target/InstrumentationRuntime.cpp +++ b/lldb/source/Target/InstrumentationRuntime.cpp @@ -55,7 +55,8 @@ void InstrumentationRuntime::ModulesDidLoad( return IterationAction::Continue; const RegularExpression &runtime_regex = GetPatternForRuntimeLibrary(); - if (runtime_regex.Execute(file_spec.GetFilename().GetCString()) || + if (MatchAllModules() || + runtime_regex.Execute(file_spec.GetFilename().GetCString()) || module_sp->IsExecutable()) { if (CheckIfRuntimeIsValid(module_sp)) { SetRuntimeModuleSP(module_sp); diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index fb9e7eb5ed1bd..42ce198a283da 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -65,7 +65,6 @@ #include "lldb/Target/ThreadPlanCallFunction.h" #include "lldb/Target/ThreadPlanStack.h" #include "lldb/Target/UnixSignals.h" -#include "lldb/Target/VerboseTrapFrameRecognizer.h" #include "lldb/Utility/AddressableBits.h" #include "lldb/Utility/Event.h" #include "lldb/Utility/LLDBLog.h" @@ -513,7 +512,6 @@ Process::Process(lldb::TargetSP target_sp, ListenerSP listener_sp, // We should have a plugin do the registration instead, for example, a // common C LanguageRuntime plugin. 
RegisterAssertFrameRecognizer(this); - RegisterVerboseTrapFrameRecognizer(*this); } Process::~Process() { diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 1e43094421f0a..a23091ad09c6d 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -3962,9 +3962,7 @@ void Target::StopHook::GetDescription(Stream &s, return; } - unsigned indent_level = s.GetIndentLevel(); - - s.SetIndentLevel(indent_level + 2); + auto indent_scope = s.MakeIndentScope(); s.Printf("Hook: %" PRIu64 "\n", GetID()); if (m_active) @@ -3978,19 +3976,17 @@ void Target::StopHook::GetDescription(Stream &s, if (m_specifier_sp) { s.Indent(); s.PutCString("Specifier:\n"); - s.SetIndentLevel(indent_level + 4); + auto indent_scope = s.MakeIndentScope(); m_specifier_sp->GetDescription(&s, level); - s.SetIndentLevel(indent_level + 2); } if (m_thread_spec_up) { StreamString tmp; s.Indent("Thread:\n"); m_thread_spec_up->GetDescription(&tmp, level); - s.SetIndentLevel(indent_level + 4); + auto indent_scope = s.MakeIndentScope(); s.Indent(tmp.GetString()); s.PutCString("\n"); - s.SetIndentLevel(indent_level + 2); } GetSubclassDescription(s, level); } @@ -4003,14 +3999,13 @@ void Target::StopHookCommandLine::GetSubclassDescription( s.PutCString(m_commands.GetStringAtIndex(0)); return; } - s.Indent("Commands: \n"); - s.SetIndentLevel(s.GetIndentLevel() + 4); + s.Indent("Commands:\n"); + auto indent_scope = s.MakeIndentScope(4); uint32_t num_commands = m_commands.GetSize(); for (uint32_t i = 0; i < num_commands; i++) { s.Indent(m_commands.GetStringAtIndex(i)); s.PutCString("\n"); } - s.SetIndentLevel(s.GetIndentLevel() - 4); } // Target::StopHookCommandLine @@ -4145,7 +4140,7 @@ void Target::StopHookScripted::GetSubclassDescription( return; s.Indent("Args:\n"); - s.SetIndentLevel(s.GetIndentLevel() + 4); + auto indent_scope = s.MakeIndentScope(4); auto print_one_element = [&s](llvm::StringRef key, StructuredData::Object *object) { @@ -4155,8 +4150,6 @@ void Target::StopHookScripted::GetSubclassDescription( }; as_dict->ForEach(print_one_element); - - s.SetIndentLevel(s.GetIndentLevel() - 4); } static constexpr OptionEnumValueElement g_dynamic_value_types[] = { diff --git a/lldb/source/Utility/RegisterValue.cpp b/lldb/source/Utility/RegisterValue.cpp index 12c349a143c0f..8b2af4e3d4f0e 100644 --- a/lldb/source/Utility/RegisterValue.cpp +++ b/lldb/source/Utility/RegisterValue.cpp @@ -206,7 +206,7 @@ Status RegisterValue::SetValueFromData(const RegisterInfo ®_info, int128.x[0] = data2; int128.x[1] = data1; } - SetUInt128(llvm::APInt(128, 2, int128.x)); + SetUInt128(llvm::APInt(128, int128.x)); } break; case eEncodingIEEE754: @@ -596,8 +596,10 @@ llvm::APInt RegisterValue::GetAsUInt128(const llvm::APInt &fail_value, case 8: case 16: return llvm::APInt( - BITWIDTH_INT128, NUM_OF_WORDS_INT128, - (reinterpret_cast<const type128 *>(buffer.bytes.data()))->x); + BITWIDTH_INT128, + llvm::ArrayRef( + (reinterpret_cast<const type128 *>(buffer.bytes.data()))->x, + NUM_OF_WORDS_INT128)); } } break; } diff --git a/lldb/source/Utility/Stream.cpp b/lldb/source/Utility/Stream.cpp index 89dce9fb0e1f7..e9632c3e1fc1f 100644 --- a/lldb/source/Utility/Stream.cpp +++ b/lldb/source/Utility/Stream.cpp @@ -202,6 +202,14 @@ void Stream::IndentLess(unsigned amount) { m_indent_level = 0; } +// Create an indentation scope that restores the original indent level when the +// object goes out of scope (RAII). 
+Stream::IndentScope Stream::MakeIndentScope(unsigned indent_amount) { + IndentScope indent_scope(*this); + IndentMore(indent_amount); + return indent_scope; +} + // Get the address size in bytes uint32_t Stream::GetAddressByteSize() const { return m_addr_size; } diff --git a/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py b/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py index 50efecbc88c36..bed129a7a7a8c 100644 --- a/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py +++ b/lldb/test/API/commands/expression/weak_symbols/TestWeakSymbols.py @@ -15,7 +15,7 @@ class TestWeakSymbolsInExpressions(TestBase): NO_DEBUG_INFO_TESTCASE = True @skipUnlessDarwin - @skipIf(compiler="clang", compiler_version=["<", "7.0"]) + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_weak_symbol_in_expr(self): """Tests that we can refer to weak symbols in expressions.""" self.build() diff --git a/lldb/test/API/commands/frame/select-hidden/Makefile b/lldb/test/API/commands/frame/select-hidden/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py b/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py new file mode 100644 index 0000000000000..698447b552877 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/TestNavigateHiddenFrame.py @@ -0,0 +1,32 @@ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class NavigateHiddenFrameTestCase(TestBase): + NO_DEBUG_INFO_TESTCASE = True + + @add_test_categories(["libc++"]) + def test(self): + """Test going up/down a backtrace but we started in a hidden frame.""" + self.build() + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + self, "Break here", lldb.SBFileSpec("main.cpp") + ) + # up + self.assertIn("__impl2", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("__impl1", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("__impl", thread.selected_frame.GetFunctionName()) + self.expect("up") + self.assertIn("non_impl", thread.selected_frame.GetFunctionName()) + + # Back down again. 
+ self.expect("down") + self.assertIn("__impl", thread.selected_frame.GetFunctionName()) + self.expect("down") + self.assertIn("__impl1", thread.selected_frame.GetFunctionName()) + self.expect("down") + self.assertIn("__impl2", thread.selected_frame.GetFunctionName()) diff --git a/lldb/test/API/commands/frame/select-hidden/main.cpp b/lldb/test/API/commands/frame/select-hidden/main.cpp new file mode 100644 index 0000000000000..dc97abb6323a4 --- /dev/null +++ b/lldb/test/API/commands/frame/select-hidden/main.cpp @@ -0,0 +1,13 @@ +namespace std { +namespace __1 { +static const char *__impl2() { return "Break here"; } +static const char *__impl1() { return __impl2(); } +static const char *__impl() { return __impl1(); } +static const char *non_impl() { return __impl(); } +} // namespace __1 +} // namespace std + +int main() { + std::__1::non_impl(); + __builtin_debugtrap(); +} diff --git a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py index 2f942da604ff2..eeb5d1b554b01 100644 --- a/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py +++ b/lldb/test/API/lang/cpp/libcxx-internals-recognizer/TestLibcxxInternalsRecognizer.py @@ -9,7 +9,7 @@ class LibCxxInternalsRecognizerTestCase(TestBase): NO_DEBUG_INFO_TESTCASE = True @add_test_categories(["libc++"]) - @skipIf(compiler="clang", compiler_version=["<=", "19.0"]) + @skipIf(compiler="clang", compiler_version=["<", "21.0"]) def test_frame_recognizer(self): """Test that implementation details of libc++ are hidden""" self.build() diff --git a/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py b/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py index 142d27ddad37f..f3558f62d51f8 100644 --- a/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py +++ b/lldb/test/API/lang/objc/modules-auto-import/TestModulesAutoImport.py @@ -16,6 +16,7 @@ def setUp(self): self.line = line_number("main.m", "// Set breakpoint 0 here.") @skipIf(macos_version=["<", "10.12"]) + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_expr(self): self.build() exe = self.getBuildArtifact("a.out") diff --git a/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py b/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py index 3be064ae7d5f8..657a7103ee989 100644 --- a/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py +++ b/lldb/test/API/lang/objc/modules-objc-property/TestModulesObjCProperty.py @@ -6,6 +6,7 @@ class TestCase(TestBase): @no_debug_info_test + @skipIf(compiler="clang", compiler_version=["<", "19.0"]) def test_conflicting_properties(self): """Tests receiving two properties with the same name from modules.""" self.build() diff --git a/lldb/test/API/test_utils/pdb/Makefile b/lldb/test/API/test_utils/pdb/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/test_utils/pdb/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/test_utils/pdb/TestPdb.py b/lldb/test/API/test_utils/pdb/TestPdb.py new file mode 100644 index 0000000000000..bd3a9d0c34ab3 --- /dev/null +++ b/lldb/test/API/test_utils/pdb/TestPdb.py @@ -0,0 +1,18 @@ +""" +Test PDB enabled tests +""" + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * + + +class TestBuildMethod(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + 
+ def test(self): + self.build() + self.assertTrue(self.dbg.CreateTarget(self.getBuildArtifact())) + if self.getDebugInfo() == "pdb": + self.expect( + "target modules dump symfile", patterns=["SymbolFile (native-)?pdb"] + ) diff --git a/lldb/test/API/test_utils/pdb/main.cpp b/lldb/test/API/test_utils/pdb/main.cpp new file mode 100644 index 0000000000000..76e8197013aab --- /dev/null +++ b/lldb/test/API/test_utils/pdb/main.cpp @@ -0,0 +1 @@ +int main() { return 0; } diff --git a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py index 09e3f62f0eead..19f88d88c2ff4 100644 --- a/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py +++ b/lldb/test/API/tools/lldb-dap/disconnect/TestDAP_disconnect.py @@ -3,17 +3,15 @@ """ -import dap_server from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil import lldbdap_testcase -import subprocess import time import os -class TestDAP_launch(lldbdap_testcase.DAPTestCaseBase): +class TestDAP_disconnect(lldbdap_testcase.DAPTestCaseBase): source = "main.cpp" def disconnect_and_assert_no_output_printed(self): @@ -67,10 +65,11 @@ def test_attach(self): lambda: self.run_platform_command("rm %s" % (sync_file_path)) ) - self.process = subprocess.Popen([program, sync_file_path]) + proc = self.spawnSubprocess(program, [sync_file_path]) lldbutil.wait_for_file_on_target(self, sync_file_path) - self.attach(pid=self.process.pid, disconnectAutomatically=False) + self.attach(pid=proc.pid, disconnectAutomatically=False, stopOnEntry=True) + self.continue_to_next_stop() response = self.dap_server.request_evaluate("wait_for_attach = false;") self.assertTrue(response["success"]) diff --git a/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py b/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py index 12b321cf42778..3c53cf2ed3460 100644 --- a/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py +++ b/lldb/test/API/tools/lldb-dap/server/TestDAP_server.py @@ -37,7 +37,7 @@ def cleanup(): def run_debug_session(self, connection, name, sleep_seconds_in_middle=None): self.dap_server = dap_server.DebugAdapterServer( - connection=connection, + connection=connection, spawn_helper=self.spawnSubprocess ) program = self.getBuildArtifact("a.out") source = "main.c" @@ -94,6 +94,7 @@ def test_server_interrupt(self): (process, connection) = self.start_server(connection="listen://localhost:0") self.dap_server = dap_server.DebugAdapterServer( connection=connection, + spawn_helper=self.spawnSubprocess, ) program = self.getBuildArtifact("a.out") source = "main.c" diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt index 513d1ec493ee1..818dff58aceeb 100644 --- a/lldb/test/CMakeLists.txt +++ b/lldb/test/CMakeLists.txt @@ -202,7 +202,7 @@ if(TARGET clang) else() # We require libcxx for the test suite, so if we aren't building it, # provide a helpful error about how to resolve the situation. - if(NOT LLDB_HAS_LIBCXX) + if(LLDB_ENFORCE_STRICT_TEST_REQUIREMENTS AND NOT LLDB_HAS_LIBCXX) message(SEND_ERROR "LLDB test suite requires libc++, but it is currently disabled. 
" "Please add `libcxx` to `LLVM_ENABLE_RUNTIMES` or disable tests via " diff --git a/lldb/test/Shell/DAP/TestClientLauncher.test b/lldb/test/Shell/DAP/TestClientLauncher.test new file mode 100644 index 0000000000000..a79a940da5a98 --- /dev/null +++ b/lldb/test/Shell/DAP/TestClientLauncher.test @@ -0,0 +1,2 @@ +# RUN: lldb-dap --client vscode-url -- /path/to/foo | FileCheck %s +# CHECK: vscode://llvm-vs-code-extensions.lldb-dap/start?program=%2Fpath%2Fto%2Ffoo diff --git a/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test b/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test new file mode 100644 index 0000000000000..a9557801cc134 --- /dev/null +++ b/lldb/test/Shell/ExecControl/StopHook/stop-hook-list-format.test @@ -0,0 +1,36 @@ +# Test format (e.g., indentation) when printing the list of stop hooks. +# +# RUN: %lldb -b -s %s | FileCheck %s --match-full-lines --strict-whitespace + +# Create some stop hooks +target stop-hook add -o 'print "Hello"' --auto-continue true --at-initial-stop true +target stop-hook add -o 'print "world,"' -o 'print "nice"' --file 'my_file' +target stop-hook add -o 'print "weather!"' --classname 'MyClass' --thread-name 'my_thread' + +# Print hooks +target stop-hook list + +# CHECK:(lldb) target stop-hook list +# CHECK:Hook: 1 +# CHECK: State: enabled +# CHECK: AutoContinue on +# CHECK: Commands: +# CHECK: print "Hello" +# CHECK-EMPTY: +# CHECK:Hook: 2 +# CHECK: State: enabled +# CHECK: Specifier: +# CHECK: File: my_file. +# CHECK: Commands: +# CHECK: print "world," +# CHECK: print "nice" +# CHECK-EMPTY: +# CHECK:Hook: 3 +# CHECK: State: enabled +# CHECK: Specifier: +# CHECK: Class name: MyClass. +# CHECK: Thread: +# CHECK: thread name: "my_thread" +# CHECK: Commands: +# CHECK: print "weather!" +# CHECK-EMPTY: diff --git a/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m b/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m new file mode 100644 index 0000000000000..83a829a8c2fdd --- /dev/null +++ b/lldb/test/Shell/Recognizer/Inputs/verbose_trap.m @@ -0,0 +1,4 @@ +int main() { + __builtin_verbose_trap("Foo", "Bar"); + return 0; +} diff --git a/lldb/test/Shell/Recognizer/registration-unique.test b/lldb/test/Shell/Recognizer/registration-unique.test new file mode 100644 index 0000000000000..34400d9a27575 --- /dev/null +++ b/lldb/test/Shell/Recognizer/registration-unique.test @@ -0,0 +1,56 @@ +# UNSUPPORTED: system-windows + +# Checks that the recognizers that should work across language runtimes +# are only registered once with the target. + +# RUN: split-file %s %t + +# RUN: %clang_host %t/main.cpp -g -o %t/cpp.out +# RUN: %lldb -b -s %t/commands.input %t/cpp.out | FileCheck %s + +# RUN: %clang_host -x objective-c++ %t/main.mm -g -o %t/objcxx.out +# RUN: %lldb -b -s %t/commands.input %t/objcxx.out | FileCheck %s + +# RUN: %clang_host %t/main.c -g -o %t/c.out +# RUN: %lldb -b -s %t/commands.input %t/c.out | FileCheck %s + +# RUN: %clang_host -x objective-c %t/main.m -g -o %t/objc.out +# RUN: %lldb -b -s %t/commands.input %t/objc.out | FileCheck %s + +#--- main.m +int main() {} + +#--- main.c +int main() {} + +#--- main.mm +int main() {} + +#--- main.cpp +int main() {} + +#--- commands.input + +b main +frame recognizer list +run +frame recognizer list +continue +run +frame recognizer list + +# CHECK: frame recognizer list +# CHECK-NEXT: no matching results found. 
+ +# CHECK: frame recognizer list +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer +# CHECK-NOT: Verbose Trap StackFrame Recognizer +# CHECK-NOT: Assert StackFrame Recognizer + +# FIXME: avoid duplicate frame recognizers in the target: https://github.com/llvm/llvm-project/issues/166341 +# CHECK: frame recognizer list +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer +# CHECK-DAG: Verbose Trap StackFrame Recognizer +# CHECK-DAG: Assert StackFrame Recognizer diff --git a/lldb/test/Shell/Recognizer/verbose_trap-objc.test b/lldb/test/Shell/Recognizer/verbose_trap-objc.test new file mode 100644 index 0000000000000..789e79c9542c5 --- /dev/null +++ b/lldb/test/Shell/Recognizer/verbose_trap-objc.test @@ -0,0 +1,12 @@ +# REQUIRES: system-darwin +# +# RUN: %clangxx_host -x objective-c -g %S/Inputs/verbose_trap.m -o %t.out +# RUN: %lldb -b -s %s %t.out | FileCheck %s + +run +# CHECK: thread #{{.*}}stop reason = Foo: Bar +frame info +# CHECK: frame #{{.*}}`main at verbose_trap.m +frame recognizer info 0 +# CHECK: frame 0 is recognized by Verbose Trap StackFrame Recognizer +q diff --git a/lldb/tools/driver/CMakeLists.txt b/lldb/tools/driver/CMakeLists.txt index 67956af7fe3fb..efe51506f3545 100644 --- a/lldb/tools/driver/CMakeLists.txt +++ b/lldb/tools/driver/CMakeLists.txt @@ -37,6 +37,9 @@ add_dependencies(lldb if(DEFINED LLDB_PYTHON_DLL_RELATIVE_PATH) target_compile_definitions(lldb PRIVATE LLDB_PYTHON_DLL_RELATIVE_PATH="${LLDB_PYTHON_DLL_RELATIVE_PATH}") endif() +if(DEFINED LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME) + target_compile_definitions(lldb PRIVATE LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME="${LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME}") +endif() if(LLDB_BUILD_FRAMEWORK) # In the build-tree, we know the exact path to the framework directory. diff --git a/lldb/tools/driver/Driver.cpp b/lldb/tools/driver/Driver.cpp index 733331f4ddac0..bebf1a70d50e9 100644 --- a/lldb/tools/driver/Driver.cpp +++ b/lldb/tools/driver/Driver.cpp @@ -433,7 +433,8 @@ SBError Driver::ProcessArgs(const opt::InputArgList &args, bool &exiting) { return error; } -#if defined(_WIN32) && defined(LLDB_PYTHON_DLL_RELATIVE_PATH) +#ifdef _WIN32 +#ifdef LLDB_PYTHON_DLL_RELATIVE_PATH /// Returns the full path to the lldb.exe executable. inline std::wstring GetPathToExecutableW() { // Iterate until we reach the Windows API maximum path length (32,767). @@ -447,30 +448,73 @@ inline std::wstring GetPathToExecutableW() { return L""; } -/// Resolve the full path of the directory defined by +/// \brief Resolve the full path of the directory defined by /// LLDB_PYTHON_DLL_RELATIVE_PATH. If it exists, add it to the list of DLL /// search directories. -void AddPythonDLLToSearchPath() { +/// \return `true` if the library was added to the search path. +/// `false` otherwise. +bool AddPythonDLLToSearchPath() { std::wstring modulePath = GetPathToExecutableW(); - if (modulePath.empty()) { - llvm::errs() << "error: unable to find python.dll." 
<< '\n';
-    return;
-  }
+  if (modulePath.empty())
+    return false;
 
   SmallVector<char, MAX_PATH> utf8Path;
   if (sys::windows::UTF16ToUTF8(modulePath.c_str(), modulePath.length(),
                                 utf8Path))
-    return;
+    return false;
 
   sys::path::remove_filename(utf8Path);
   sys::path::append(utf8Path, LLDB_PYTHON_DLL_RELATIVE_PATH);
   sys::fs::make_absolute(utf8Path);
 
   SmallVector<wchar_t, 1> widePath;
   if (sys::windows::widenPath(utf8Path.data(), widePath))
-    return;
+    return false;
 
   if (sys::fs::exists(utf8Path))
-    SetDllDirectoryW(widePath.data());
+    return SetDllDirectoryW(widePath.data());
+
+  return false;
+}
+#endif
+
+#ifdef LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME
+/// Returns whether `python3xx.dll` is in the DLL search path.
+bool IsPythonDLLInPath() {
+#define WIDEN2(x) L##x
+#define WIDEN(x) WIDEN2(x)
+  WCHAR foundPath[MAX_PATH];
+  DWORD result =
+      SearchPathW(nullptr, WIDEN(LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME), nullptr,
+                  MAX_PATH, foundPath, nullptr);
+#undef WIDEN2
+#undef WIDEN
+
+  return result > 0;
+}
+#endif
+
+/// Try to set up the DLL search path for the Python Runtime Library
+/// (python3xx.dll).
+///
+/// If `LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME` is set, we first check if
+/// python3xx.dll is in the search path. If it's not, we try to add it and
+/// check for it a second time.
+/// If only `LLDB_PYTHON_DLL_RELATIVE_PATH` is set, we try to add python3xx.dll
+/// to the search path whether it is already in the search path or not.
+void SetupPythonRuntimeLibrary() {
+#ifdef LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME
+  if (IsPythonDLLInPath())
+    return;
+#ifdef LLDB_PYTHON_DLL_RELATIVE_PATH
+  if (AddPythonDLLToSearchPath() && IsPythonDLLInPath())
+    return;
+#endif
+  llvm::errs() << "error: unable to find '"
+               << LLDB_PYTHON_RUNTIME_LIBRARY_FILENAME << "'.\n";
+  return;
+#elif defined(LLDB_PYTHON_DLL_RELATIVE_PATH)
+  if (!AddPythonDLLToSearchPath())
+    llvm::errs() << "error: unable to find the Python runtime library.\n";
+#endif
 }
 #endif
 
@@ -776,8 +820,8 @@ int main(int argc, char const *argv[]) {
           "~/Library/Logs/DiagnosticReports/.\n");
 #endif
 
-#if defined(_WIN32) && defined(LLDB_PYTHON_DLL_RELATIVE_PATH)
-  AddPythonDLLToSearchPath();
+#ifdef _WIN32
+  SetupPythonRuntimeLibrary();
+#endif
 
   // Parse arguments.
diff --git a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt
index dd1bbbdddfc59..fa940b7b73943 100644
--- a/lldb/tools/lldb-dap/CMakeLists.txt
+++ b/lldb/tools/lldb-dap/CMakeLists.txt
@@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS Support)
 add_lldb_library(lldbDAP
   Breakpoint.cpp
   BreakpointBase.cpp
+  ClientLauncher.cpp
   CommandPlugins.cpp
   DAP.cpp
   DAPError.cpp
diff --git a/lldb/tools/lldb-dap/ClientLauncher.cpp b/lldb/tools/lldb-dap/ClientLauncher.cpp
new file mode 100644
index 0000000000000..4cac1d6346441
--- /dev/null
+++ b/lldb/tools/lldb-dap/ClientLauncher.cpp
@@ -0,0 +1,74 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ClientLauncher.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/FormatVariadic.h"
+
+using namespace lldb_dap;
+
+std::optional<ClientLauncher::Client>
+ClientLauncher::GetClientFrom(llvm::StringRef str) {
+  return llvm::StringSwitch<std::optional<ClientLauncher::Client>>(str.lower())
+      .Case("vscode", ClientLauncher::VSCode)
+      .Case("vscode-url", ClientLauncher::VSCodeURL)
+      .Default(std::nullopt);
+}
+
+std::unique_ptr<ClientLauncher>
+ClientLauncher::GetLauncher(ClientLauncher::Client client) {
+  switch (client) {
+  case ClientLauncher::VSCode:
+    return std::make_unique<VSCodeLauncher>();
+  case ClientLauncher::VSCodeURL:
+    return std::make_unique<VSCodeURLPrinter>();
+  }
+  return nullptr;
+}
+
+std::string VSCodeLauncher::URLEncode(llvm::StringRef str) {
+  std::string out;
+  llvm::raw_string_ostream os(out);
+  for (unsigned char c : str) {
+    if (std::isalnum(c) || llvm::StringRef("-_.~").contains(c))
+      os << c;
+    else
+      os << '%' << llvm::utohexstr(c, false, 2);
+  }
+  return os.str();
+}
+
+std::string
+VSCodeLauncher::GetLaunchURL(const std::vector<llvm::StringRef> args) const {
+  assert(!args.empty() && "empty launch args");
+
+  std::vector<std::string> encoded_launch_args;
+  for (llvm::StringRef arg : args)
+    encoded_launch_args.push_back(URLEncode(arg));
+
+  const std::string args_str = llvm::join(encoded_launch_args, "&args=");
+  return llvm::formatv(
+             "vscode://llvm-vs-code-extensions.lldb-dap/start?program={0}",
+             args_str)
+      .str();
+}
+
+llvm::Error VSCodeLauncher::Launch(const std::vector<llvm::StringRef> args) {
+  const std::string launch_url = GetLaunchURL(args);
+  const std::string command =
+      llvm::formatv("code --open-url {0}", launch_url).str();
+
+  std::system(command.c_str());
+  return llvm::Error::success();
+}
+
+llvm::Error VSCodeURLPrinter::Launch(const std::vector<llvm::StringRef> args) {
+  llvm::outs() << GetLaunchURL(args) << '\n';
+  return llvm::Error::success();
+}
diff --git a/lldb/tools/lldb-dap/ClientLauncher.h b/lldb/tools/lldb-dap/ClientLauncher.h
new file mode 100644
index 0000000000000..780b178d2d6ef
--- /dev/null
+++ b/lldb/tools/lldb-dap/ClientLauncher.h
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_TOOLS_LLDB_DAP_CLIENTLAUNCHER_H
+#define LLDB_TOOLS_LLDB_DAP_CLIENTLAUNCHER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include <vector>
+
+namespace lldb_dap {
+
+class ClientLauncher {
+public:
+  enum Client {
+    VSCode,
+    VSCodeURL,
+  };
+
+  virtual ~ClientLauncher() = default;
+  virtual llvm::Error Launch(const std::vector<llvm::StringRef> args) = 0;
+
+  static std::optional<Client> GetClientFrom(llvm::StringRef str);
+  static std::unique_ptr<ClientLauncher> GetLauncher(Client client);
+};
+
+class VSCodeLauncher : public ClientLauncher {
+public:
+  using ClientLauncher::ClientLauncher;
+
+  llvm::Error Launch(const std::vector<llvm::StringRef> args) override;
+
+  std::string GetLaunchURL(const std::vector<llvm::StringRef> args) const;
+  static std::string URLEncode(llvm::StringRef str);
+};
+
+class VSCodeURLPrinter : public VSCodeLauncher {
+  using VSCodeLauncher::VSCodeLauncher;
+
+  llvm::Error Launch(const std::vector<llvm::StringRef> args) override;
+};
+
+} // namespace lldb_dap
+
+#endif
diff --git a/lldb/tools/lldb-dap/tool/Options.td b/lldb/tools/lldb-dap/tool/Options.td
index 5e9dd7a1d6419..339a64fed6c32 100644
--- a/lldb/tools/lldb-dap/tool/Options.td
+++ b/lldb/tools/lldb-dap/tool/Options.td
@@ -82,3 +82,11 @@ def connection_timeout: S<"connection-timeout">,
     "timeout is reached, the server will be closed and the process will exit. "
     "Not specifying this argument or specifying non-positive values will "
     "cause the server to wait for new connections indefinitely.">;
+
+def client
+    : S<"client">,
+      MetaVarName<"<client>">,
+      HelpText<
+          "Use lldb-dap as a launcher for a curated set of DAP clients.">;
+
+def REM : R<["--"], "">;
diff --git a/lldb/tools/lldb-dap/tool/lldb-dap.cpp b/lldb/tools/lldb-dap/tool/lldb-dap.cpp
index 45caa1a81059b..f10ed12344cbd 100644
--- a/lldb/tools/lldb-dap/tool/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/tool/lldb-dap.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "ClientLauncher.h"
 #include "DAP.h"
 #include "DAPLog.h"
 #include "EventHelper.h"
@@ -141,6 +142,12 @@ static void PrintHelp(LLDBDAPOptTable &table, llvm::StringRef tool_name) {
     debugger to attach to the process.
 
     lldb-dap -g
+
+  You can also use lldb-dap to launch a supported client, for example the
+  LLDB-DAP Visual Studio Code extension.
+ + lldb-dap --client vscode -- /path/to/binary <args> + )___"; } @@ -150,6 +157,29 @@ static void PrintVersion() { llvm::outs() << "liblldb: " << lldb::SBDebugger::GetVersionString() << '\n'; } +static llvm::Error LaunchClient(const llvm::opt::InputArgList &args) { + auto *client_arg = args.getLastArg(OPT_client); + assert(client_arg && "must have client arg"); + + std::optional<ClientLauncher::Client> client = + ClientLauncher::GetClientFrom(client_arg->getValue()); + if (!client) + return llvm::createStringError( + llvm::formatv("unsupported client: {0}", client_arg->getValue())); + + std::vector<llvm::StringRef> launch_args; + if (auto *arg = args.getLastArgNoClaim(OPT_REM)) { + for (auto *value : arg->getValues()) { + launch_args.push_back(value); + } + } + + if (launch_args.empty()) + return llvm::createStringError("no launch arguments provided"); + + return ClientLauncher::GetLauncher(*client)->Launch(launch_args); +} + #if not defined(_WIN32) struct FDGroup { int GetFlags() const { @@ -541,6 +571,14 @@ int main(int argc, char *argv[]) { return EXIT_SUCCESS; } + if (input_args.hasArg(OPT_client)) { + if (llvm::Error error = LaunchClient(input_args)) { + llvm::WithColor::error() << llvm::toString(std::move(error)) << '\n'; + return EXIT_FAILURE; + } + return EXIT_SUCCESS; + } + ReplMode default_repl_mode = ReplMode::Auto; if (input_args.hasArg(OPT_repl_mode)) { llvm::opt::Arg *repl_mode = input_args.getLastArg(OPT_repl_mode); diff --git a/lldb/unittests/DAP/CMakeLists.txt b/lldb/unittests/DAP/CMakeLists.txt index a08414c30e6cd..b1fdef18fddba 100644 --- a/lldb/unittests/DAP/CMakeLists.txt +++ b/lldb/unittests/DAP/CMakeLists.txt @@ -1,4 +1,5 @@ add_lldb_unittest(DAPTests + ClientLauncherTest.cpp DAPErrorTest.cpp DAPTest.cpp DAPTypesTest.cpp diff --git a/lldb/unittests/DAP/ClientLauncherTest.cpp b/lldb/unittests/DAP/ClientLauncherTest.cpp new file mode 100644 index 0000000000000..dbaf9ee786336 --- /dev/null +++ b/lldb/unittests/DAP/ClientLauncherTest.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ClientLauncher.h" +#include "llvm/ADT/StringRef.h" +#include "gtest/gtest.h" +#include <optional> + +using namespace lldb_dap; +using namespace llvm; + +TEST(ClientLauncherTest, GetClientFromVSCode) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("vscode"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromVSCodeUpperCase) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("VSCODE"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromVSCodeMixedCase) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("VSCode"); + ASSERT_TRUE(result.has_value()); + EXPECT_EQ(ClientLauncher::VSCode, result.value()); +} + +TEST(ClientLauncherTest, GetClientFromInvalidString) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom("invalid"); + EXPECT_FALSE(result.has_value()); +} + +TEST(ClientLauncherTest, GetClientFromEmptyString) { + std::optional<ClientLauncher::Client> result = + ClientLauncher::GetClientFrom(""); + EXPECT_FALSE(result.has_value()); +} + +TEST(ClientLauncherTest, URLEncode) { + EXPECT_EQ("", VSCodeLauncher::URLEncode("")); + EXPECT_EQ( + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_.~", + VSCodeLauncher::URLEncode("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRST" + "UVWXYZ0123456789-_.~")); + EXPECT_EQ("hello%20world", VSCodeLauncher::URLEncode("hello world")); + EXPECT_EQ("hello%21%40%23%24", VSCodeLauncher::URLEncode("hello!@#$")); + EXPECT_EQ("%2Fpath%2Fto%2Ffile", VSCodeLauncher::URLEncode("/path/to/file")); + EXPECT_EQ("key%3Dvalue%26key2%3Dvalue2", + VSCodeLauncher::URLEncode("key=value&key2=value2")); + EXPECT_EQ("100%25complete", VSCodeLauncher::URLEncode("100%complete")); + EXPECT_EQ("file_name%20with%20spaces%20%26%20special%21.txt", + VSCodeLauncher::URLEncode("file_name with spaces & special!.txt")); + EXPECT_EQ("%00%01%02", + VSCodeLauncher::URLEncode(llvm::StringRef("\x00\x01\x02", 3))); + EXPECT_EQ("test-file_name.txt~", + VSCodeLauncher::URLEncode("test-file_name.txt~")); + + // UTF-8 encoded characters should be percent-encoded byte by byte. + EXPECT_EQ("%C3%A9", VSCodeLauncher::URLEncode("é")); +} diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index c450ee5a3d72e..f192cd05b5a34 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -1264,10 +1264,10 @@ endif() # Build with _XOPEN_SOURCE on z/OS. if (CMAKE_SYSTEM_NAME MATCHES "OS390") add_compile_definitions(_XOPEN_SOURCE=600) + add_compile_definitions(_XPLATFORM_SOURCE) # Needed e.g. for O_CLOEXEC. add_compile_definitions(_OPEN_SYS) # Needed for process information. add_compile_definitions(_OPEN_SYS_FILE_EXT) # Needed for EBCDIC I/O. add_compile_definitions(_EXT) # Needed for file data. - add_compile_definitions(_UNIX03_THREADS) # Multithreading support. # Need to build LLVM as ASCII application. # This can't be a global setting because other projects may # need to be built in EBCDIC mode. 
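For readers skimming the lldb-dap changes above, here is a small, self-contained sketch (not part of the patch) of the URL scheme the new VS Code launcher produces: alphanumerics and `-_.~` pass through unchanged, every other byte becomes a `%XX` escape, and the encoded arguments are appended to the `vscode://llvm-vs-code-extensions.lldb-dap/start?program=` prefix. The helper names below are made up for illustration; only the output format is taken from the patch and its tests.

```cpp
// Standalone illustration of the launcher's percent-encoding, using only the
// C++ standard library so it can be compiled and run on its own.
#include <cstdio>
#include <string>
#include <vector>

// Percent-encode a string byte by byte, keeping unreserved characters.
static std::string urlEncode(const std::string &S) {
  static const char *Hex = "0123456789ABCDEF";
  std::string Out;
  for (unsigned char C : S) {
    bool Unreserved = (C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||
                      (C >= '0' && C <= '9') || C == '-' || C == '_' ||
                      C == '.' || C == '~';
    if (Unreserved) {
      Out += static_cast<char>(C);
    } else {
      Out += '%';
      Out += Hex[C >> 4];
      Out += Hex[C & 0xF];
    }
  }
  return Out;
}

int main() {
  // The first argument is the program; the rest become "&args=" query entries.
  std::vector<std::string> Args = {"/path/to/foo"};
  std::string Query;
  for (size_t I = 0; I < Args.size(); ++I)
    Query += (I == 0 ? "" : "&args=") + urlEncode(Args[I]);
  // Prints the same URL that TestClientLauncher.test checks for:
  // vscode://llvm-vs-code-extensions.lldb-dap/start?program=%2Fpath%2Fto%2Ffoo
  std::printf("vscode://llvm-vs-code-extensions.lldb-dap/start?program=%s\n",
              Query.c_str());
  return 0;
}
```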
diff --git a/llvm/cmake/modules/CrossCompile.cmake b/llvm/cmake/modules/CrossCompile.cmake index bfbd9cfd4063f..2a69c5133c56f 100644 --- a/llvm/cmake/modules/CrossCompile.cmake +++ b/llvm/cmake/modules/CrossCompile.cmake @@ -101,6 +101,7 @@ function(llvm_create_cross_target project_name target_name toolchain buildtype) -DLLVM_INCLUDE_BENCHMARKS=OFF -DLLVM_INCLUDE_TESTS=OFF -DLLVM_TABLEGEN_FLAGS="${LLVM_TABLEGEN_FLAGS}" + -DPYTHON_EXECUTABLE="${PYTHON_EXECUTABLE}" ${build_type_flags} ${linker_flag} ${external_clang_dir} ${libc_flags} ${ARGN} WORKING_DIRECTORY ${${project_name}_${target_name}_BUILD} diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 7780c0a6dca0a..30b22a4a6d607 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -883,8 +883,9 @@ supported for the ``amdgcn`` target. Buffer Fat Pointer 7 N/A N/A 160 0 Buffer Resource 8 N/A V# 128 0x00000000000000000000000000000000 Buffer Strided Pointer (experimental) 9 *TODO* - *reserved for downstream use* 10 - *reserved for downstream use* 11 + *reserved for future use* 10 + *reserved for future use* 11 + *reserved for downstream use (LLPC)* 12 Streamout Registers 128 N/A GS_REGS ===================================== =============== =========== ================ ======= ============================ diff --git a/llvm/docs/CommandGuide/dsymutil.rst b/llvm/docs/CommandGuide/dsymutil.rst index 8e61e01d7d9c3..0e442d657e987 100644 --- a/llvm/docs/CommandGuide/dsymutil.rst +++ b/llvm/docs/CommandGuide/dsymutil.rst @@ -70,6 +70,14 @@ OPTIONS Print this help output. +.. option:: --include-swiftmodules-from-interface + + Whether or not to copy binary swiftmodules built from textual .swiftinterface + files into the dSYM bundle. These typically come only from the SDK (since + textual interfaces require library evolution) and thus are a waste of space to + copy into the bundle. Turn this on if the swiftmodules are different from + those in the SDK. + .. option:: --keep-function-for-static Make a static variable keep the enclosing function even if it would have been diff --git a/llvm/docs/CommandGuide/llvm-dwarfdump.rst b/llvm/docs/CommandGuide/llvm-dwarfdump.rst index 137830259eb64..dfc0431f07826 100644 --- a/llvm/docs/CommandGuide/llvm-dwarfdump.rst +++ b/llvm/docs/CommandGuide/llvm-dwarfdump.rst @@ -134,6 +134,15 @@ OPTIONS Abbreviate the description of type unit entries. +.. option:: -t, --filter-child-tag + + Only dump children whose DWARF tag is one of the specified tags. + Example usage: + + .. code-block:: c + + llvm-dwarfdump -t DW_TAG_structure_type -t DW_TAG_member -c + .. option:: -x, --regex Treat any <name> strings as regular expressions when searching diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 3c089b5a0ba79..b9507a2d054fe 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -2741,6 +2741,11 @@ For example: ``"nooutline"`` This attribute indicates that outlining passes should not modify the function. +``nocreateundeforpoison`` + This attribute indicates that the result of the function (prior to + application of return attributes/metadata) will not be undef or poison if + all arguments are not undef and not poison. Otherwise, it is undefined + behavior. 
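As a rough illustration of how an analysis could consume the new `nocreateundeforpoison` guarantee documented above, the sketch below combines it with `isGuaranteedNotToBeUndefOrPoison` from ValueTracking. This is not code from the patch; in particular, the `Attribute::NoCreateUndefOrPoison` enumerator name is an assumption (the attribute's C++ spelling is defined in Attributes.td, which is not part of this hunk).

```cpp
// Hedged sketch: if the callee promises not to introduce undef/poison and
// every argument is known to be well defined, the call's result is too.
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/InstrTypes.h"

using namespace llvm;

static bool callResultIsWellDefined(const CallBase &CB) {
  // Assumed enumerator name; see Attributes.td for the real definition.
  if (!CB.hasFnAttr(Attribute::NoCreateUndefOrPoison))
    return false;
  for (const Use &ArgUse : CB.args())
    if (!isGuaranteedNotToBeUndefOrPoison(ArgUse.get()))
      return false;
  return true;
}
```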
Call Site Attributes ---------------------- diff --git a/llvm/docs/MIRLangRef.rst b/llvm/docs/MIRLangRef.rst index 3f4c3cde9b3aa..f7647c898c1e6 100644 --- a/llvm/docs/MIRLangRef.rst +++ b/llvm/docs/MIRLangRef.rst @@ -86,25 +86,25 @@ Tests are more accessible and future proof when simplified: - Use the ``-simplify-mir`` option with llc. - Machine function attributes often have default values or the test works just - as well with default values. Typical candidates for this are: `alignment:`, - `exposesReturnsTwice`, `legalized`, `regBankSelected`, `selected`. + as well with default values. Typical candidates for this are: ``alignment:``, + ``exposesReturnsTwice``, ``legalized``, ``regBankSelected``, ``selected``. The whole `frameInfo` section is often unnecessary if there is no special - frame usage in the function. `tracksRegLiveness` on the other hand is often + frame usage in the function. ``tracksRegLiveness`` on the other hand is often necessary for some passes that care about block livein lists. -- The (global) `liveins:` list is typically only interesting for early +- The (global) ``liveins:`` list is typically only interesting for early instruction selection passes and can be removed when testing later passes. - The per-block `liveins:` on the other hand are necessary if + The per-block ``liveins:`` on the other hand are necessary if `tracksRegLiveness` is true. -- Branch probability data in block `successors:` lists can be dropped if the +- Branch probability data in block ``successors:`` lists can be dropped if the test doesn't depend on it. Example: - `successors: %bb.1(0x40000000), %bb.2(0x40000000)` can be replaced with - `successors: %bb.1, %bb.2`. + ``successors: %bb.1(0x40000000), %bb.2(0x40000000)`` can be replaced with + ``successors: %bb.1, %bb.2``. - MIR code contains a whole IR module. This is necessary because there are no equivalents in MIR for global variables, references to external functions, - function attributes, metadata, debug info. Instead some MIR data references + function attributes, metadata, debug info. Instead, some MIR data references the IR constructs. You can often remove them if the test doesn't depend on them. @@ -114,16 +114,16 @@ Tests are more accessible and future proof when simplified: dropped: `:: (load 8)` - MIR blocks can reference IR blocks for debug printing, profile information, - or debug locations. Example: `bb.42.myblock` in MIR references the IR block - `myblock`. It is usually possible to drop the `.myblock` reference and simply - use `bb.42`. + or debug locations. Example: ``bb.42.myblock`` in MIR references the IR block + ``myblock``. It is usually possible to drop the ``.myblock`` reference and simply + use ``bb.42``. - If there are no memory operands or blocks referencing the IR, then the IR function can be replaced by a parameterless dummy function like - `define @func() { ret void }`. + ``define @func() { ret void }``. - It is possible to drop the whole IR section of the MIR file if it only - contains dummy functions (see above). The .mir loader will create the + contains dummy functions (see above). The ``.mir`` loader will create the IR functions automatically in this case. .. 
_limitations: @@ -131,7 +131,7 @@ Tests are more accessible and future proof when simplified: Limitations ----------- -Currently the MIR format has several limitations in terms of which state it +Currently, the MIR format has several limitations in terms of which state it can serialize: - The target-specific state in the target-specific ``MachineFunctionInfo`` @@ -150,7 +150,7 @@ These limitations impose restrictions on what you can test with the MIR format. For now, tests that would like to test some behaviour that depends on the state of temporary or local ``MCSymbol`` operands or the exception handling state in MMI, can't use the MIR format. As well as that, tests that test some behaviour -that depends on the state of the target specific ``MachineFunctionInfo`` or +that depends on the state of the target-specific ``MachineFunctionInfo`` or ``MachineConstantPoolValue`` subclasses can't use the MIR format at the moment. High Level Structure @@ -286,7 +286,7 @@ Example: Successors ^^^^^^^^^^ -The machine basic block's successors have to be specified before any of the +The machine basic block's successors must be specified before any of the instructions: .. code-block:: text @@ -489,13 +489,13 @@ In case this is true, the Machine Operand is printed according to the target. For example: -In AArch64RegisterInfo.td: +In ``AArch64RegisterInfo.td``: .. code-block:: text def sub_32 : SubRegIndex<32>; -If the third operand is an immediate with the value ``15`` (target-dependent +If the third operand is an immediate with the value ``15`` (a target-dependent value), based on the instruction's opcode and the operand's index the operand will be printed as ``%subreg.sub_32``: @@ -503,7 +503,7 @@ will be printed as ``%subreg.sub_32``: %1:gpr64 = SUBREG_TO_REG 0, %0, %subreg.sub_32 -For integers > 64 bits, we use a special machine operand, ``MO_CImmediate``, +For integers larger than 64 bits, we use a special machine operand, ``MO_CImmediate``, which stores the immediate in a ``ConstantInt`` using an ``APInt`` (LLVM's arbitrary-precision integers). @@ -552,7 +552,7 @@ corresponding internal ``llvm::RegState`` representation: * - ``implicit`` - ``RegState::Implicit`` - - Not emitted register (e.g. carry, or temporary result). + - Not emitted register (e.g., carry, or temporary result). * - ``implicit-def`` - ``RegState::ImplicitDefine`` @@ -625,7 +625,7 @@ For a CPI with the index 0 and offset -12: %1:gr64 = MOV64ri %const.0 - 12 -A constant pool entry is bound to a LLVM IR ``Constant`` or a target-specific +A constant pool entry is bound to an LLVM IR ``Constant`` or a target-specific ``MachineConstantPoolValue``. When serializing all the function's constants, the following format is used: @@ -670,12 +670,12 @@ a global value operand named ``G``: $rax = MOV64rm $rip, 1, _, @G, _ -The named global values are represented using an identifier with the '@' prefix. +The named global values are represented using an identifier with the ``@`` prefix. If the identifier doesn't match the regular expression -`[-a-zA-Z$._][-a-zA-Z$._0-9]*`, then this identifier must be quoted. +``[-a-zA-Z$._][-a-zA-Z$._0-9]*``, then this identifier must be quoted. The unnamed global values are represented using an unsigned numeric value with -the '@' prefix, like in the following examples: ``@0``, ``@989``. +the ``@`` prefix, as in the following examples: ``@0``, ``@989``. 
Target-dependent Index Operands ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -741,7 +741,7 @@ Example: MCSymbol Operands ^^^^^^^^^^^^^^^^^ -A MCSymbol operand holds a pointer to a ``MCSymbol``. For the limitations +An ``MCSymbol`` operand holds a pointer to an ``MCSymbol``. For the limitations of this operand in MIR, see :ref:`limitations <limitations>`. The syntax is: @@ -825,7 +825,7 @@ Comments ^^^^^^^^ Machine operands can have C/C++ style comments, which are annotations enclosed -between ``/*`` and ``*/`` to improve readability of e.g. immediate operands. +between ``/*`` and ``*/`` to improve readability of e.g., immediate operands. In the example below, ARM instructions EOR and BCC and immediate operands ``14`` and ``0`` have been annotated with their condition codes (CC) definitions, i.e. the ``always`` and ``eq`` condition codes: @@ -920,7 +920,7 @@ Instruction referencing locations This experimental feature aims to separate the specification of variable *values* from the program point where a variable takes on that value. Changes -in variable value occur in the same manner as ``DBG_VALUE`` meta instructions +in a variable value occur in the same manner as ``DBG_VALUE`` meta instructions but using ``DBG_INSTR_REF``. Variable values are identified by a pair of instruction number and operand number. Consider the example below: diff --git a/llvm/docs/MergeFunctions.rst b/llvm/docs/MergeFunctions.rst index d43b9c3a89091..d64c846687bae 100644 --- a/llvm/docs/MergeFunctions.rst +++ b/llvm/docs/MergeFunctions.rst @@ -8,9 +8,9 @@ MergeFunctions pass, how it works Introduction ============ Sometimes code contains equal functions, or functions that do exactly the same -thing even though they are non-equal on the IR level (e.g.: multiplication on 2 -and 'shl 1'). This can happen for several reasons: mainly, the usage of -templates and automatic code generators. Though, sometimes the user itself could +thing even though they are non-equal on the IR level (e.g.,: multiplication on 2 +and ``shl 1``). This can happen for several reasons: mainly, the usage of +templates and automatic code generators. However, sometimes the user itself could write the same thing twice :-) The main purpose of this pass is to recognize such functions and merge them. @@ -20,21 +20,21 @@ describes the algorithm used to compare functions and explains how we could combine equal functions correctly to keep the module valid. -Material is brought in a top-down form, so the reader could start to learn pass +The material is presented in a top-down form, so the reader could start to learn pass from high level ideas and end with low-level algorithm details, thus preparing him or her for reading the sources. The main goal is to describe the algorithm and logic here and the concept. If you *don't want* to read the source code, but want to understand pass algorithms, this document is good for you. The author tries not to repeat the -source-code and covers only common cases to avoid the cases of needing to +source code and covers only common cases to avoid the cases of needing to update this document after any minor code changes. What should I know to be able to follow along with this document? ----------------------------------------------------------------- -The reader should be familiar with common compile-engineering principles and +The reader should be familiar with common compiler-engineering principles and LLVM code fundamentals. 
In this article, we assume the reader is familiar with `Single Static Assignment <http://en.wikipedia.org/wiki/Static_single_assignment_form>`_ @@ -99,7 +99,7 @@ and a ``void*`` as equal. This is just an example; more possible details are described a bit below. As another example, the reader may imagine two more functions. The first -function performs a multiplication by 2, while the second one performs an +function performs a multiplication by 2, while the second one performs a logical left shift by 1. Possible solutions @@ -131,7 +131,7 @@ access lookup? The answer is: "yes". Random-access """"""""""""" How can this be done? Just convert each function to a number, and gather -all of them in a special hash-table. Functions with equal hashes are equal. +all of them in a special hash table. Functions with equal hashes are equal. Good hashing means, that every function part must be taken into account. That means we have to convert every function part into some number, and then add it into the hash. The lookup-up time would be small, but such an approach adds some @@ -175,7 +175,7 @@ merged with each other. It is defined as: ``std::set<FunctionNode> FnTree;`` -Here ``FunctionNode`` is a wrapper for ``llvm::Function`` class, with +Here, ``FunctionNode`` is a wrapper for ``llvm::Function`` class, with an implemented “<” operator among the functions set (below we explain how it works exactly; this is a key point in fast functions comparison). @@ -207,7 +207,7 @@ from method. Comparison and logarithmical search """"""""""""""""""""""""""""""""""" Let's recall our task: for every function *F* from module *M*, we have to find -equal functions *F`* in the shortest time possible , and merge them into a +equal functions *F`* in the shortest time possible and merge them into a single function. Defining total ordering among the functions set allows us to organize @@ -225,7 +225,7 @@ possible values: 1, left is *greater* than right. -Of course it means, that we have to maintain +Of course, it means that we have to maintain *strict and non-strict order relation properties*: * reflexivity (``a <= a``, ``a == a``, ``a >= a``), @@ -235,7 +235,7 @@ Of course it means, that we have to maintain As mentioned before, the comparison routine consists of "sub-comparison-routines", with each of them also consisting of -"sub-comparison-routines", and so on. Finally, it ends up with primitive +"sub-comparison-routines", and so on. Finally, it ends up with a primitive comparison. Below, we will use the following operations: @@ -275,7 +275,7 @@ A brief look at the source code tells us that the comparison starts in the “``int FunctionComparator::compare(void)``” method. 1. The first parts to be compared are the function's attributes and some -properties that is outside the “attributes” term, but still could make the +properties that are outside the “attributes” term, but still could make the function different without changing its body. This part of the comparison is usually done within simple *cmpNumbers* or *cmpFlags* operations (e.g. ``cmpFlags(F1->hasGC(), F2->hasGC())``). Below is a full list of function's @@ -365,7 +365,7 @@ comparing them as numbers. 7. Complex types (structures, arrays, etc.). Follow complex objects comparison technique (see the very first paragraph of this chapter). Both *left* and *right* are to be expanded and their element types will be checked the same -way. If we get -1 or 1 on some stage, return it. Otherwise return 0. +way. If we get -1 or 1 on some stage, return it. Otherwise, return 0. 
8. Steps 1-6 describe all the possible cases, if we passed steps 1-6 and didn't get any conclusions, then invoke ``llvm_unreachable``, since it's quite an @@ -445,7 +445,7 @@ How to implement cmpValues? but, in general, we need to implement antisymmetric relation. As mentioned above, to understand what is *less*, we can use order in which we meet values. If both values have the same order in a function (met at the same -time), we then treat values as *associated*. Otherwise – it depends on who was +time), we then treat values as *associated*. Otherwise, it depends on who was first. Every time we run the top-level compare method, we initialize two identical @@ -623,7 +623,7 @@ to use ``accumulateConstantOffset`` method. So, if we get constant offset for both left and right *GEPs*, then compare it as numbers, and return comparison result. -Otherwise treat it like a regular operation (see previous paragraph). +Otherwise, treat it like a regular operation (see previous paragraph). cmpOperation ------------ @@ -742,7 +742,7 @@ We call ``writeThunkOrAlias(Function *F, Function *G)``. Here we try to replace referenced anywhere, * function should come with external, local or weak linkage. -Otherwise we write thunk: some wrapper that has *G's* interface and calls *F*, +Otherwise, we write thunk: some wrapper that has *G's* interface and calls *F*, so *G* could be replaced with this wrapper. *writeAlias* @@ -772,7 +772,7 @@ As it written in method comments: “Replace G with a simple tail call to bitcast(F). Also replace direct uses of G with bitcast(F). Deletes G.” -In general it does the same as usual when we want to replace callee, except the +In general, it does the same as usual when we want to replace callee, except the first point: 1. We generate tail call wrapper around *F*, but with an interface that allows using diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 49184e3104868..d03f383a92b3b 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -406,6 +406,12 @@ The current vendor extensions supported are: ``XSfvcp`` LLVM implements `version 1.1.0 of the SiFive Vector Coprocessor Interface (VCIX) Software Specification <https://sifive.cdn.prismic.io/sifive/Zn3m1R5LeNNTwnLS_vcix-spec-software-v1p1.pdf>`__ by SiFive. All instructions are prefixed with `sf.vc.` as described in the specification, and the riscv-toolchain-convention document linked above. +``Xsfvfexp16e``, ``Xsfvfbfexp16e``, and ``Xsfvfexp32e`` + LLVM implements `version 0.5 of the Vector Exponential Extension Specification <https://www.sifive.com/document-file/exponential-function-instruction-xsfvfexp32e-xsfvf>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above. + +``Xsfvfexpa`` and ``Xsfvfexpa64e`` + LLVM implements `version 0.2 of the Vector Exponential Approximation Extension Specification <https://www.sifive.com/document-file/exponential-approximation-instruction-xsfvfexpa-ex>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above. + ``XSfvqmaccdod``, ``XSfvqmaccqoq`` LLVM implements `version 1.1.0 of the SiFive Int8 Matrix Multiplication Extensions Specification <https://sifive.cdn.prismic.io/sifive/1a2ad85b-d818-49f7-ba83-f51f1731edbe_int8-matmul-spec.pdf>`__ by SiFive. All instructions are prefixed with `sf.` as described in the specification linked above. 
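Stepping back to the MergeFunctions write-up above: the whole comparison machinery bottoms out in tiny three-way helpers such as cmpNumbers, which composite routines chain together, stopping at the first non-zero result. A minimal sketch of that convention (not the actual FunctionComparator code) looks like this:

```cpp
// Illustration of the -1 / 0 / 1 ordering used throughout the comparison
// routines described in the MergeFunctions document.
#include <cstdint>

static int cmpNumbers(uint64_t L, uint64_t R) {
  if (L < R)
    return -1; // left is "less" than right
  if (L > R)
    return 1; // left is "greater" than right
  return 0;   // equal
}

// Composite comparisons return the first non-zero sub-comparison, which is
// what makes the ordering usable as the "<" of an ordered set such as FnTree.
static int cmpPair(uint64_t LA, uint64_t LB, uint64_t RA, uint64_t RB) {
  if (int Res = cmpNumbers(LA, RA))
    return Res;
  return cmpNumbers(LB, RB);
}
```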
diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst index 85eeabf10244a..749961356d23e 100644 --- a/llvm/docs/SPIRVUsage.rst +++ b/llvm/docs/SPIRVUsage.rst @@ -173,6 +173,8 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e - Allows generating arbitrary width integer types. * - ``SPV_INTEL_bindless_images`` - Adds instructions to convert convert unsigned integer handles to images, samplers and sampled images. + * - ``SPV_INTEL_bfloat16_arithmetic`` + - Allows the use of 16-bit bfloat16 values in arithmetic and relational operators. * - ``SPV_INTEL_bfloat16_conversion`` - Adds instructions to convert between single-precision 32-bit floating-point values and 16-bit bfloat16 values. * - ``SPV_INTEL_cache_controls`` @@ -187,6 +189,8 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e - Adds decorations that can be applied to global (module scope) variables. * - ``SPV_INTEL_global_variable_fpga_decorations`` - Adds decorations that can be applied to global (module scope) variables to help code generation for FPGA devices. + * - ``SPV_INTEL_kernel_attributes`` + - Adds execution modes that can be applied to entry points to inform scheduling. * - ``SPV_INTEL_media_block_io`` - Adds additional subgroup block read and write functionality that allow applications to flexibly specify the width and height of the block to read from or write to a 2D image. * - ``SPV_INTEL_memory_access_aliasing`` @@ -226,9 +230,9 @@ Below is a list of supported SPIR-V extensions, sorted alphabetically by their e * - ``SPV_INTEL_fp_max_error`` - Adds the ability to specify the maximum error for floating-point operations. * - ``SPV_INTEL_ternary_bitwise_function`` - - Adds a bitwise instruction on three operands and a look-up table index for specifying the bitwise operation to perform. + - Adds a bitwise instruction on three operands and a look-up table index for specifying the bitwise operation to perform. * - ``SPV_INTEL_subgroup_matrix_multiply_accumulate`` - - Adds an instruction to compute the matrix product of an M x K matrix with a K x N matrix and then add an M x N matrix. + - Adds an instruction to compute the matrix product of an M x K matrix with a K x N matrix and then add an M x N matrix. * - ``SPV_INTEL_int4`` - Adds support for 4-bit integer type, and allow this type to be used in cooperative matrices. * - ``SPV_KHR_float_controls2`` diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h index bccdb8930561e..82ac9a3a1ef80 100644 --- a/llvm/include/llvm/ADT/APFloat.h +++ b/llvm/include/llvm/ADT/APFloat.h @@ -152,7 +152,7 @@ class APFloatBase { static constexpr unsigned integerPartWidth = APInt::APINT_BITS_PER_WORD; /// A signed type to represent a floating point numbers unbiased exponent. - typedef int32_t ExponentType; + using ExponentType = int32_t; /// \name Floating Point Semantics. /// @{ @@ -938,8 +938,8 @@ LLVM_ABI DoubleAPFloat frexp(const DoubleAPFloat &X, int &Exp, roundingMode); // This is a interface class that is currently forwarding functionalities from // detail::IEEEFloat. 
class APFloat : public APFloatBase { - typedef detail::IEEEFloat IEEEFloat; - typedef detail::DoubleAPFloat DoubleAPFloat; + using IEEEFloat = detail::IEEEFloat; + using DoubleAPFloat = detail::DoubleAPFloat; static_assert(std::is_standard_layout<IEEEFloat>::value); diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h index 9fa98ad4ddde1..fdb3b84b73a1f 100644 --- a/llvm/include/llvm/ADT/APInt.h +++ b/llvm/include/llvm/ADT/APInt.h @@ -77,7 +77,7 @@ inline APInt operator-(APInt); /// class [[nodiscard]] APInt { public: - typedef uint64_t WordType; + using WordType = uint64_t; /// Byte size of a word. static constexpr unsigned APINT_WORD_SIZE = sizeof(WordType); @@ -154,6 +154,7 @@ class [[nodiscard]] APInt { /// Once all uses of this constructor are migrated to other constructors, /// consider marking this overload ""= delete" to prevent calls from being /// incorrectly bound to the APInt(unsigned, uint64_t, bool) constructor. + [[deprecated("Use other constructors of APInt")]] LLVM_ABI APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[]); /// Construct an APInt from a string representation. diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h index 9e81a4b735e7f..cc3f3a9226395 100644 --- a/llvm/include/llvm/ADT/BitVector.h +++ b/llvm/include/llvm/ADT/BitVector.h @@ -99,7 +99,7 @@ template <typename BitVectorT> class const_set_bits_iterator_impl { }; class BitVector { - typedef uintptr_t BitWord; + using BitWord = uintptr_t; enum { BITWORD_SIZE = (unsigned)sizeof(BitWord) * CHAR_BIT }; @@ -147,8 +147,8 @@ class BitVector { } }; - typedef const_set_bits_iterator_impl<BitVector> const_set_bits_iterator; - typedef const_set_bits_iterator set_iterator; + using const_set_bits_iterator = const_set_bits_iterator_impl<BitVector>; + using set_iterator = const_set_bits_iterator; const_set_bits_iterator set_bits_begin() const { return const_set_bits_iterator(*this); diff --git a/llvm/include/llvm/ADT/GenericSSAContext.h b/llvm/include/llvm/ADT/GenericSSAContext.h index e9f99bafe9f1e..426a083778d6e 100644 --- a/llvm/include/llvm/ADT/GenericSSAContext.h +++ b/llvm/include/llvm/ADT/GenericSSAContext.h @@ -25,7 +25,7 @@ template <typename, bool> class DominatorTreeBase; template <typename> class SmallVectorImpl; namespace Intrinsic { -typedef unsigned ID; +using ID = unsigned; } // Specializations of this template should provide the types used by the diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h index a9841c6651b72..af0e4a36be1b1 100644 --- a/llvm/include/llvm/ADT/STLExtras.h +++ b/llvm/include/llvm/ADT/STLExtras.h @@ -1516,8 +1516,8 @@ template <class Iterator, class RNG> void shuffle(Iterator first, Iterator last, RNG &&g) { // It would be better to use a std::uniform_int_distribution, // but that would be stdlib dependent. - typedef - typename std::iterator_traits<Iterator>::difference_type difference_type; + using difference_type = + typename std::iterator_traits<Iterator>::difference_type; for (auto size = last - first; size > 1; ++first, (void)--size) { difference_type offset = g() % size; // Avoid self-assignment due to incorrect assertions in libstdc++ @@ -2600,16 +2600,6 @@ bool hasNItemsOrLess(ContainerTy &&C, unsigned N) { return hasNItemsOrLess(adl_begin(C), adl_end(C), N); } -/// Returns a raw pointer that represents the same address as the argument. -/// -/// This implementation can be removed once we move to C++20 where it's defined -/// as std::to_address(). 
-/// -/// The std::pointer_traits<>::to_address(p) variations of these overloads has -/// not been implemented. -template <class Ptr> auto to_address(const Ptr &P) { return P.operator->(); } -template <class T> constexpr T *to_address(T *P) { return P; } - // Detect incomplete types, relying on the fact that their size is unknown. namespace detail { template <typename T> using has_sizeof = decltype(sizeof(T)); diff --git a/llvm/include/llvm/ADT/STLForwardCompat.h b/llvm/include/llvm/ADT/STLForwardCompat.h index e02694f043fbb..ad94cdede9288 100644 --- a/llvm/include/llvm/ADT/STLForwardCompat.h +++ b/llvm/include/llvm/ADT/STLForwardCompat.h @@ -134,6 +134,16 @@ struct identity // NOLINT(readability-identifier-naming) } }; +/// Returns a raw pointer that represents the same address as the argument. +/// +/// This implementation can be removed once we move to C++20 where it's defined +/// as std::to_address(). +/// +/// The std::pointer_traits<>::to_address(p) variations of these overloads has +/// not been implemented. +template <class Ptr> auto to_address(const Ptr &P) { return P.operator->(); } +template <class T> constexpr T *to_address(T *P) { return P; } + //===----------------------------------------------------------------------===// // Features from C++23 //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/ADT/StringSwitch.h b/llvm/include/llvm/ADT/StringSwitch.h index 8c8d31bd4f055..53ebec1eb3a54 100644 --- a/llvm/include/llvm/ADT/StringSwitch.h +++ b/llvm/include/llvm/ADT/StringSwitch.h @@ -14,7 +14,6 @@ #define LLVM_ADT_STRINGSWITCH_H #include "llvm/ADT/StringRef.h" -#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" #include <cassert> #include <cstring> @@ -42,6 +41,8 @@ namespace llvm { /// .Cases({"violet", "purple"}, Violet) /// .Default(UnknownColor); /// \endcode +/// +/// When multiple matches are found, the value of the first match is returned. template<typename T, typename R = T> class StringSwitch { /// The string we are matching. @@ -64,7 +65,7 @@ class StringSwitch { void operator=(const StringSwitch &) = delete; void operator=(StringSwitch &&) = delete; - // Case-sensitive case matchers + // Case-sensitive case matchers. StringSwitch &Case(StringLiteral S, T Value) { CaseImpl(S, Value); return *this; @@ -214,23 +215,30 @@ class StringSwitch { [[nodiscard]] operator R() { return DefaultUnreachable(); } private: - // Returns true when `Str` matches the `S` argument, and stores the result. + // Returns true when a match is found. If `Str` matches the `S` argument, + // stores the result. bool CaseImpl(StringLiteral S, T &Value) { - if (!Result && Str == S) { - Result = std::move(Value); + if (Result) return true; - } - return false; + + if (Str != S) + return false; + + Result = std::move(Value); + return true; } - // Returns true when `Str` matches the `S` argument (case-insensitive), and - // stores the result. + // Returns true when a match is found. If `Str` matches the `S` argument + // (case-insensitive), stores the result. 
bool CaseLowerImpl(StringLiteral S, T &Value) { - if (!Result && Str.equals_insensitive(S)) { - Result = std::move(Value); + if (Result) return true; - } - return false; + + if (!Str.equals_insensitive(S)) + return false; + + Result = std::move(Value); + return true; } StringSwitch &CasesImpl(std::initializer_list<StringLiteral> Cases, diff --git a/llvm/include/llvm/ADT/ilist.h b/llvm/include/llvm/ADT/ilist.h index aed19ccbff7f2..64392903bec74 100644 --- a/llvm/include/llvm/ADT/ilist.h +++ b/llvm/include/llvm/ADT/ilist.h @@ -108,21 +108,21 @@ template <typename Ty> struct ilist_traits<const Ty> {}; /// list. template <class IntrusiveListT, class TraitsT> class iplist_impl : public TraitsT, IntrusiveListT { - typedef IntrusiveListT base_list_type; + using base_list_type = IntrusiveListT; public: - typedef typename base_list_type::pointer pointer; - typedef typename base_list_type::const_pointer const_pointer; - typedef typename base_list_type::reference reference; - typedef typename base_list_type::const_reference const_reference; - typedef typename base_list_type::value_type value_type; - typedef typename base_list_type::size_type size_type; - typedef typename base_list_type::difference_type difference_type; - typedef typename base_list_type::iterator iterator; - typedef typename base_list_type::const_iterator const_iterator; - typedef typename base_list_type::reverse_iterator reverse_iterator; - typedef - typename base_list_type::const_reverse_iterator const_reverse_iterator; + using pointer = typename base_list_type::pointer; + using const_pointer = typename base_list_type::const_pointer; + using reference = typename base_list_type::reference; + using const_reference = typename base_list_type::const_reference; + using value_type = typename base_list_type::value_type; + using size_type = typename base_list_type::size_type; + using difference_type = typename base_list_type::difference_type; + using iterator = typename base_list_type::iterator; + using const_iterator = typename base_list_type::const_iterator; + using reverse_iterator = typename base_list_type::reverse_iterator; + using const_reverse_iterator = + typename base_list_type::const_reverse_iterator; private: static bool op_less(const_reference L, const_reference R) { return L < R; } diff --git a/llvm/include/llvm/ADT/ilist_node_options.h b/llvm/include/llvm/ADT/ilist_node_options.h index 003d5dabce897..53719b07a3768 100644 --- a/llvm/include/llvm/ADT/ilist_node_options.h +++ b/llvm/include/llvm/ADT/ilist_node_options.h @@ -58,8 +58,8 @@ namespace ilist_detail { template <bool IsExplicit> struct explicitness { static const bool is_explicit = IsExplicit; }; -typedef explicitness<true> is_explicit; -typedef explicitness<false> is_implicit; +using is_explicit = explicitness<true>; +using is_implicit = explicitness<false>; /// Check whether an option is valid. /// @@ -103,12 +103,12 @@ struct is_valid_option<ilist_sentinel_tracking<EnableSentinelTracking>> template <class... Options> struct extract_tag; template <class Tag, class... Options> struct extract_tag<ilist_tag<Tag>, Options...> { - typedef Tag type; + using type = Tag; }; template <class Option1, class... Options> struct extract_tag<Option1, Options...> : extract_tag<Options...> {}; template <> struct extract_tag<> { - typedef void type; + using type = void; }; template <class Tag> struct is_valid_option<ilist_tag<Tag>> : std::true_type {}; @@ -134,11 +134,13 @@ struct is_valid_option<ilist_iterator_bits<IteratorBits>> : std::true_type {}; template <class... 
Options> struct extract_parent; template <class ParentTy, class... Options> struct extract_parent<ilist_parent<ParentTy>, Options...> { - typedef ParentTy type; + using type = ParentTy; }; template <class Option1, class... Options> struct extract_parent<Option1, Options...> : extract_parent<Options...> {}; -template <> struct extract_parent<> { typedef void type; }; +template <> struct extract_parent<> { + using type = void; +}; template <class ParentTy> struct is_valid_option<ilist_parent<ParentTy>> : std::true_type {}; @@ -154,28 +156,27 @@ struct check_options : std::conjunction<is_valid_option<Options>...> {}; template <class T, bool EnableSentinelTracking, bool IsSentinelTrackingExplicit, class TagT, bool HasIteratorBits, class ParentTy> struct node_options { - typedef T value_type; - typedef T *pointer; - typedef T &reference; - typedef const T *const_pointer; - typedef const T &const_reference; + using value_type = T; + using pointer = T *; + using reference = T &; + using const_pointer = const T *; + using const_reference = const T &; static const bool enable_sentinel_tracking = EnableSentinelTracking; static const bool is_sentinel_tracking_explicit = IsSentinelTrackingExplicit; static const bool has_iterator_bits = HasIteratorBits; - typedef TagT tag; - typedef ParentTy parent_ty; - typedef ilist_node_base<enable_sentinel_tracking, parent_ty> node_base_type; - typedef ilist_base<enable_sentinel_tracking, parent_ty> list_base_type; + using tag = TagT; + using parent_ty = ParentTy; + using node_base_type = ilist_node_base<enable_sentinel_tracking, parent_ty>; + using list_base_type = ilist_base<enable_sentinel_tracking, parent_ty>; }; template <class T, class... Options> struct compute_node_options { - typedef node_options<T, extract_sentinel_tracking<Options...>::value, - extract_sentinel_tracking<Options...>::is_explicit, - typename extract_tag<Options...>::type, - extract_iterator_bits<Options...>::value, - typename extract_parent<Options...>::type> - type; + using type = node_options<T, extract_sentinel_tracking<Options...>::value, + extract_sentinel_tracking<Options...>::is_explicit, + typename extract_tag<Options...>::type, + extract_iterator_bits<Options...>::value, + typename extract_parent<Options...>::type>; }; } // end namespace ilist_detail diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h index 84b4ad7c1d5a9..c85ef3e131068 100644 --- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h +++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h @@ -893,7 +893,7 @@ replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, /// result of this function is undefined. LLVM_ABI std::optional<int64_t> getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, - const Loop *Lp, + const Loop *Lp, const DominatorTree &DT, const DenseMap<Value *, const SCEV *> &StridesMap = DenseMap<Value *, const SCEV *>(), bool Assume = false, bool ShouldCheckWrap = true); diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index af218ba564081..093309cb8bbee 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -1024,6 +1024,16 @@ findValuesAffectedByCondition(Value *Cond, bool IsAssume, LLVM_ABI Value *stripNullTest(Value *V); LLVM_ABI const Value *stripNullTest(const Value *V); +/// Enumerates all possible values of V and inserts them into the set \p +/// Constants. 
If \p AllowUndefOrPoison is false, it fails when V may contain +/// undef/poison elements. Returns true if the result is complete. Otherwise, +/// the result is incomplete (more than MaxCount values). +/// NOTE: The constant values are not distinct. +LLVM_ABI bool +collectPossibleValues(const Value *V, + SmallPtrSetImpl<const Constant *> &Constants, + unsigned MaxCount, bool AllowUndefOrPoison = true); + } // end namespace llvm #endif // LLVM_ANALYSIS_VALUETRACKING_H diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index 464f475098ec5..b0c5beae631ce 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -801,6 +801,7 @@ enum AttributeKindCodes { ATTR_KIND_CAPTURES = 102, ATTR_KIND_DEAD_ON_RETURN = 103, ATTR_KIND_SANITIZE_ALLOC_TOKEN = 104, + ATTR_KIND_NO_CREATE_UNDEF_OR_POISON = 105, }; enum ComdatSelectionKindCodes { diff --git a/llvm/include/llvm/CAS/ActionCache.h b/llvm/include/llvm/CAS/ActionCache.h index 69ee4dde1974a..7f5b11223c54d 100644 --- a/llvm/include/llvm/CAS/ActionCache.h +++ b/llvm/include/llvm/CAS/ActionCache.h @@ -75,6 +75,9 @@ class ActionCache { CanBeDistributed); } + /// Validate the ActionCache contents. + virtual Error validate() const = 0; + virtual ~ActionCache() = default; protected: @@ -97,6 +100,9 @@ class ActionCache { /// Create an action cache in memory. std::unique_ptr<ActionCache> createInMemoryActionCache(); +/// Create an action cache on disk. +Expected<std::unique_ptr<ActionCache>> createOnDiskActionCache(StringRef Path); + } // end namespace llvm::cas #endif // LLVM_CAS_ACTIONCACHE_H diff --git a/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h new file mode 100644 index 0000000000000..6c165c421b168 --- /dev/null +++ b/llvm/include/llvm/CAS/BuiltinUnifiedCASDatabases.h @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H +#define LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H + +#include "llvm/Support/Error.h" + +namespace llvm::cas { + +class ActionCache; +class ObjectStore; + +/// Create on-disk \c ObjectStore and \c ActionCache instances based on +/// \c ondisk::UnifiedOnDiskCache, with built-in hashing. +Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>> +createOnDiskUnifiedCASDatabases(StringRef Path); + +/// Represents the result of validating the contents using +/// \c validateOnDiskUnifiedCASDatabasesIfNeeded. +/// +/// Note: invalid results are handled as an \c Error. +enum class ValidationResult { + /// The data is already valid. + Valid, + /// The data was invalid, but was recovered. + Recovered, + /// Validation was skipped, as it was not needed. + Skipped, +}; + +/// Validate the data in \p Path, if needed to ensure correctness. +/// +/// \param Path directory for the on-disk database. +/// \param CheckHash Whether to validate hashes match the data. +/// \param AllowRecovery Whether to automatically recover from invalid data by +/// marking the files for garbage collection. +/// \param ForceValidation Whether to force validation to occur even if it +/// should not be necessary. 
+/// \param LLVMCasBinaryPath If provided, validation is performed out-of-process +/// using the given \c llvm-cas executable which protects against crashes +/// during validation. Otherwise validation is performed in-process. +/// +/// \returns \c Valid if the data is already valid, \c Recovered if data +/// was invalid but has been cleared, \c Skipped if validation is not needed, +/// or an \c Error if validation cannot be performed or if the data is left +/// in an invalid state because \p AllowRecovery is false. +Expected<ValidationResult> validateOnDiskUnifiedCASDatabasesIfNeeded( + StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinaryPath); + +} // namespace llvm::cas + +#endif // LLVM_CAS_BUILTINUNIFIEDCASDATABASES_H diff --git a/llvm/include/llvm/CAS/ObjectStore.h b/llvm/include/llvm/CAS/ObjectStore.h index 6db5dd3904095..29950fe9d9029 100644 --- a/llvm/include/llvm/CAS/ObjectStore.h +++ b/llvm/include/llvm/CAS/ObjectStore.h @@ -5,6 +5,11 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +/// +/// \file +/// This file contains the declaration of the ObjectStore class. +/// +//===----------------------------------------------------------------------===// #ifndef LLVM_CAS_OBJECTSTORE_H #define LLVM_CAS_OBJECTSTORE_H @@ -111,7 +116,10 @@ class ObjectStore { virtual Expected<bool> isMaterialized(ObjectRef Ref) const = 0; /// Validate the underlying object referred by CASID. - virtual Error validate(const CASID &ID) = 0; + virtual Error validateObject(const CASID &ID) = 0; + + /// Validate the entire ObjectStore. + virtual Error validate(bool CheckHash) const = 0; protected: /// Load the object referenced by \p Ref. @@ -215,9 +223,39 @@ class ObjectStore { return Data.size(); } + /// Set the size for limiting growth of on-disk storage. This has an effect + /// for when the instance is closed. + /// + /// Implementations may leave this unimplemented. + virtual Error setSizeLimit(std::optional<uint64_t> SizeLimit) { + return Error::success(); + } + + /// \returns the storage size of the on-disk CAS data. + /// + /// Implementations that don't have an implementation for this should return + /// \p std::nullopt. + virtual Expected<std::optional<uint64_t>> getStorageSize() const { + return std::nullopt; + } + + /// Prune local storage to reduce its size according to the desired size + /// limit. Pruning can happen concurrently with other operations. + /// + /// Implementations may leave this unimplemented. + virtual Error pruneStorageData() { return Error::success(); } + /// Validate the whole node tree. Error validateTree(ObjectRef Ref); + /// Import object from another CAS. This will import the full tree from the + /// other CAS. + Expected<ObjectRef> importObject(ObjectStore &Upstream, ObjectRef Other); + + /// Print the ObjectStore internals for debugging purpose. + virtual void print(raw_ostream &) const {} + void dump() const; + /// Get CASContext const CASContext &getContext() const { return Context; } @@ -290,8 +328,15 @@ class ObjectProxy { ObjectHandle H; }; +/// Create an in memory CAS. std::unique_ptr<ObjectStore> createInMemoryCAS(); +/// \returns true if \c LLVM_ENABLE_ONDISK_CAS configuration was enabled. +bool isOnDiskCASEnabled(); + +/// Create a persistent on-disk path at \p Path. 
+Expected<std::unique_ptr<ObjectStore>> createOnDiskCAS(const Twine &Path);
+
 } // namespace cas
 } // namespace llvm
diff --git a/llvm/include/llvm/CAS/OnDiskGraphDB.h b/llvm/include/llvm/CAS/OnDiskGraphDB.h
index 5f0ee0e131c0f..76cc528711b69 100644
--- a/llvm/include/llvm/CAS/OnDiskGraphDB.h
+++ b/llvm/include/llvm/CAS/OnDiskGraphDB.h
@@ -340,13 +340,16 @@ class OnDiskGraphDB {
   /// \param HashByteSize Size for the object digest hash bytes.
   /// \param UpstreamDB Optional on-disk store to be used for faulting-in nodes
   /// if they don't exist in the primary store. The upstream store is only used
-  /// for reading nodes, new nodes are only written to the primary store.
+  /// for reading nodes, new nodes are only written to the primary store. Users
+  /// need to make sure \p UpstreamDB outlives the current instance of
+  /// OnDiskGraphDB; the common usage is to have a \p UnifiedOnDiskCache manage
+  /// both.
   /// \param Policy If \p UpstreamDB is provided, controls how nodes are copied
   /// to primary store. This is recorded at creation time and subsequent opens
   /// need to pass the same policy otherwise the \p open will fail.
   static Expected<std::unique_ptr<OnDiskGraphDB>>
   open(StringRef Path, StringRef HashName, unsigned HashByteSize,
-       std::unique_ptr<OnDiskGraphDB> UpstreamDB = nullptr,
+       OnDiskGraphDB *UpstreamDB = nullptr,
        FaultInPolicy Policy = FaultInPolicy::FullTree);

   ~OnDiskGraphDB();
@@ -438,8 +441,7 @@ class OnDiskGraphDB {
   // Private constructor.
   OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index,
-                OnDiskDataAllocator DataPool,
-                std::unique_ptr<OnDiskGraphDB> UpstreamDB,
+                OnDiskDataAllocator DataPool, OnDiskGraphDB *UpstreamDB,
                 FaultInPolicy Policy);

   /// Mapping from hash to object reference.
@@ -459,7 +461,7 @@ class OnDiskGraphDB {
   std::string RootPath;

   /// Optional on-disk store to be used for faulting-in nodes.
-  std::unique_ptr<OnDiskGraphDB> UpstreamDB;
+  OnDiskGraphDB *UpstreamDB = nullptr;

   /// The policy used to fault in data from upstream.
   FaultInPolicy FIPolicy;
diff --git a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
index b762518366c21..17ae52f0307fc 100644
--- a/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
+++ b/llvm/include/llvm/CAS/OnDiskKeyValueDB.h
@@ -19,6 +19,8 @@
 namespace llvm::cas::ondisk {

+class UnifiedOnDiskCache;
+
 /// An on-disk key-value data store with the following properties:
 /// * Keys are fixed length binary hashes with expected normal distribution.
 /// * Values are buffers of the same size, specified at creation time.
@@ -59,9 +61,13 @@ class OnDiskKeyValueDB {
   /// \param KeySize Size for the key hash bytes.
   /// \param ValueName Identifier name for the values.
   /// \param ValueSize Size for the value bytes.
+  /// \param UnifiedCache An optional UnifiedOnDiskCache that manages the size
+  /// and lifetime of the CAS instance; it must take ownership of the
+  /// KeyValueDB being initialized here once initialization completes.
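/// For orientation, a minimal usage sketch for the on-disk ObjectStore factory
/// introduced earlier in this patch (createOnDiskCAS, isOnDiskCASEnabled, and
/// ObjectStore::validate); the path and error handling are illustrative only,
/// not part of the patch:
/// \code
///   Error openAndCheck() {
///     if (!cas::isOnDiskCASEnabled())
///       return Error::success();
///     auto CAS = cas::createOnDiskCAS("/path/to/cas");
///     if (!CAS)
///       return CAS.takeError();
///     return (*CAS)->validate(/*CheckHash=*/true);
///   }
/// \endcode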
  static Expected<std::unique_ptr<OnDiskKeyValueDB>>
  open(StringRef Path, StringRef HashName, unsigned KeySize,
-       StringRef ValueName, size_t ValueSize);
+       StringRef ValueName, size_t ValueSize,
+       UnifiedOnDiskCache *UnifiedCache = nullptr);

   using CheckValueT =
       function_ref<Error(FileOffset Offset, ArrayRef<char> Data)>;
@@ -70,11 +76,14 @@ class OnDiskKeyValueDB {
   Error validate(CheckValueT CheckValue) const;

 private:
-  OnDiskKeyValueDB(size_t ValueSize, OnDiskTrieRawHashMap Cache)
-      : ValueSize(ValueSize), Cache(std::move(Cache)) {}
+  OnDiskKeyValueDB(size_t ValueSize, OnDiskTrieRawHashMap Cache,
+                   UnifiedOnDiskCache *UnifiedCache)
+      : ValueSize(ValueSize), Cache(std::move(Cache)),
+        UnifiedCache(UnifiedCache) {}

   const size_t ValueSize;
   OnDiskTrieRawHashMap Cache;
+  UnifiedOnDiskCache *UnifiedCache = nullptr;
 };

 } // namespace llvm::cas::ondisk
diff --git a/llvm/include/llvm/CAS/UnifiedOnDiskCache.h b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h
new file mode 100644
index 0000000000000..6e0878a65fe72
--- /dev/null
+++ b/llvm/include/llvm/CAS/UnifiedOnDiskCache.h
@@ -0,0 +1,172 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CAS_UNIFIEDONDISKCACHE_H
+#define LLVM_CAS_UNIFIEDONDISKCACHE_H
+
+#include "llvm/CAS/BuiltinUnifiedCASDatabases.h"
+#include "llvm/CAS/OnDiskGraphDB.h"
+#include <atomic>
+
+namespace llvm::cas::ondisk {
+
+class OnDiskKeyValueDB;
+
+/// A unified CAS nodes and key-value database, using on-disk storage for both.
+/// It manages storage growth and provides APIs for garbage collection.
+///
+/// High-level properties:
+/// * While \p UnifiedOnDiskCache is open on a directory, by any process, the
+/// storage size in that directory will keep growing unrestricted. For data to
+/// become eligible for garbage-collection there should be no open instances
+/// of \p UnifiedOnDiskCache for that directory, by any process.
+/// * Garbage-collection needs to be triggered explicitly by the client. It can
+/// be triggered on a directory concurrently, at any time and by any process,
+/// without affecting any active readers/writers, in the same process or other
+/// processes.
+///
+/// Usage patterns should be that an instance of \p UnifiedOnDiskCache is open
+/// for a limited period of time, e.g. for the duration of a build operation.
+/// For long-living processes that need periodic access to a
+/// \p UnifiedOnDiskCache, the client should devise a scheme where access is
+/// performed within some defined period. For example, if a service is designed
+/// to continuously wait for requests that access a \p UnifiedOnDiskCache, it
+/// could keep the instance alive while new requests are coming in but close it
+/// after a time period in which there are no new requests.
+class UnifiedOnDiskCache {
+public:
+  /// The \p OnDiskGraphDB instance for the open directory.
+  OnDiskGraphDB &getGraphDB() { return *PrimaryGraphDB; }
+
+  /// The \p OnDiskKeyValueDB instance for the open directory.
+  OnDiskKeyValueDB &getKeyValueDB() { return *PrimaryKVDB; }
+
+  /// Open a \p UnifiedOnDiskCache instance for a directory.
+  ///
+  /// \param Path directory for the on-disk database. The directory will be
+  /// created if it doesn't exist.
+  /// \param SizeLimit Optional size for limiting growth. This has an effect for
+  /// when the instance is closed.
+  /// \param HashName Identifier name for the hashing algorithm that is going to
+  /// be used.
+  /// \param HashByteSize Size for the object digest hash bytes.
+  /// \param FaultInPolicy Controls how nodes are copied to primary store. This
+  /// is recorded at creation time and subsequent opens need to pass the same
+  /// policy otherwise the \p open will fail.
+  static Expected<std::unique_ptr<UnifiedOnDiskCache>>
+  open(StringRef Path, std::optional<uint64_t> SizeLimit, StringRef HashName,
+       unsigned HashByteSize,
+       OnDiskGraphDB::FaultInPolicy FaultInPolicy =
+           OnDiskGraphDB::FaultInPolicy::FullTree);
+
+  /// Validate the data in \p Path, if needed to ensure correctness.
+  ///
+  /// Note: if invalid data is detected and \p AllowRecovery is true, then
+  /// recovery requires exclusive access to the CAS and it is an error to
+  /// attempt recovery if there is concurrent use of the CAS.
+  ///
+  /// \param Path directory for the on-disk database.
+  /// \param HashName Identifier name for the hashing algorithm that is going to
+  /// be used.
+  /// \param HashByteSize Size for the object digest hash bytes.
+  /// \param CheckHash Whether to validate hashes match the data.
+  /// \param AllowRecovery Whether to automatically recover from invalid data by
+  /// marking the files for garbage collection.
+  /// \param ForceValidation Whether to force validation to occur even if it
+  /// should not be necessary.
+  /// \param LLVMCasBinary If provided, validation is performed out-of-process
+  /// using the given \c llvm-cas executable which protects against crashes
+  /// during validation. Otherwise validation is performed in-process.
+  ///
+  /// \returns \c Valid if the data is already valid, \c Recovered if data
+  /// was invalid but has been cleared, \c Skipped if validation is not needed,
+  /// or an \c Error if validation cannot be performed or if the data is left
+  /// in an invalid state because \p AllowRecovery is false.
+  static Expected<ValidationResult>
+  validateIfNeeded(StringRef Path, StringRef HashName, unsigned HashByteSize,
+                   bool CheckHash, bool AllowRecovery, bool ForceValidation,
+                   std::optional<StringRef> LLVMCasBinary);
+
+  /// This is called implicitly at destruction time, so it is not required for a
+  /// client to call this. After calling \p close the only method that is valid
+  /// to call is \p needsGarbageCollection.
+  ///
+  /// \param CheckSizeLimit if true it will check whether the primary store has
+  /// exceeded its intended size limit. If false the check is skipped even if a
+  /// \p SizeLimit was passed to the \p open call.
+  Error close(bool CheckSizeLimit = true);
+
+  /// Set the size for limiting growth. This has an effect for when the instance
+  /// is closed.
+  void setSizeLimit(std::optional<uint64_t> SizeLimit);
+
+  /// \returns the storage size of the cache data.
+  uint64_t getStorageSize() const;
+
+  /// \returns whether the primary store has exceeded the intended size limit.
+  /// This can return false even if the overall size of the opened directory is
+  /// over the \p SizeLimit passed to \p open. To know whether garbage
+  /// collection needs to be triggered or not, call \p needsGarbageCollection.
+  bool hasExceededSizeLimit() const;
+
+  /// \returns whether there are unused data that can be deleted using a
+  /// \p collectGarbage call.
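/// A sketch of the open/close lifecycle described above, assuming the
/// declarations in this header; the path, hash name, size limit, and function
/// name are illustrative only:
/// \code
///   Error useCAS() {
///     auto UniDB = UnifiedOnDiskCache::open(
///         "/path/to/cas", /*SizeLimit=*/std::nullopt, "BLAKE3",
///         /*HashByteSize=*/32);
///     if (!UniDB)
///       return UniDB.takeError();
///     OnDiskGraphDB &Nodes = (*UniDB)->getGraphDB();       // CAS node storage
///     OnDiskKeyValueDB &Cache = (*UniDB)->getKeyValueDB(); // key-value storage
///     // ... perform reads/writes for the duration of the build ...
///     if (Error E = (*UniDB)->close())
///       return E;
///     if ((*UniDB)->needsGarbageCollection())
///       return UnifiedOnDiskCache::collectGarbage("/path/to/cas");
///     return Error::success();
///   }
/// \endcode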
+ bool needsGarbageCollection() const { return NeedsGarbageCollection; } + + /// Remove any unused data from the directory at \p Path. If there are no such + /// data the operation is a no-op. + /// + /// This can be called concurrently, regardless of whether there is an open + /// \p UnifiedOnDiskCache instance or not; it has no effect on readers/writers + /// in the same process or other processes. + /// + /// It is recommended that garbage-collection is triggered concurrently in the + /// background, so that it has minimal effect on the workload of the process. + static Error collectGarbage(StringRef Path); + + /// Remove unused data from the current UnifiedOnDiskCache. + Error collectGarbage(); + + /// Helper function to convert the value stored in KeyValueDB and ObjectID. + static ObjectID getObjectIDFromValue(ArrayRef<char> Value); + + using ValueBytes = std::array<char, sizeof(uint64_t)>; + static ValueBytes getValueFromObjectID(ObjectID ID); + + ~UnifiedOnDiskCache(); + +private: + friend class OnDiskGraphDB; + friend class OnDiskKeyValueDB; + + UnifiedOnDiskCache(); + + Expected<std::optional<ArrayRef<char>>> + faultInFromUpstreamKV(ArrayRef<uint8_t> Key); + + /// \returns the storage size of the primary directory. + uint64_t getPrimaryStorageSize() const; + + std::string RootPath; + std::atomic<uint64_t> SizeLimit; + + int LockFD = -1; + + std::atomic<bool> NeedsGarbageCollection; + std::string PrimaryDBDir; + + std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB; + std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB; + + std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB; + std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB; +}; + +} // namespace llvm::cas::ondisk + +#endif // LLVM_CAS_UNIFIEDONDISKCACHE_H diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index e8dbc964a943e..221d8f1e2f673 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -302,7 +302,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { /// (e.g. scalarization). std::optional<InstructionCost> getMultipleResultIntrinsicVectorLibCallCost( const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind, - RTLIB::Libcall LC, std::optional<unsigned> CallRetElementIndex = {}) const { Type *RetTy = ICA.getReturnType(); // Vector variants of the intrinsic can be mapped to a vector library call. @@ -311,12 +310,38 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { !isVectorizedStructTy(cast<StructType>(RetTy))) return std::nullopt; + Type *Ty = getContainedTypes(RetTy).front(); + EVT VT = getTLI()->getValueType(DL, Ty); + + EVT ScalarVT = VT.getScalarType(); + RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL; + + switch (ICA.getID()) { + case Intrinsic::modf: + LC = RTLIB::getMODF(ScalarVT); + break; + case Intrinsic::sincospi: + LC = RTLIB::getSINCOSPI(ScalarVT); + break; + case Intrinsic::sincos: + LC = RTLIB::getSINCOS(ScalarVT); + break; + default: + return std::nullopt; + } + // Find associated libcall. - const char *LCName = getTLI()->getLibcallName(LC); - if (!LCName) + RTLIB::LibcallImpl LibcallImpl = getTLI()->getLibcallImpl(LC); + if (LibcallImpl == RTLIB::Unsupported) return std::nullopt; + StringRef LCName = + RTLIB::RuntimeLibcallsInfo::getLibcallImplName(LibcallImpl); + // Search for a corresponding vector variant. + // + // FIXME: Should use RuntimeLibcallsInfo, not TargetLibraryInfo to get the + // vector mapping. 
LLVMContext &Ctx = RetTy->getContext(); ElementCount VF = getVectorizedTypeVF(RetTy); VecDesc const *VD = nullptr; @@ -2137,22 +2162,6 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { case Intrinsic::modf: case Intrinsic::sincos: case Intrinsic::sincospi: { - Type *Ty = getContainedTypes(RetTy).front(); - EVT VT = getTLI()->getValueType(DL, Ty); - - RTLIB::Libcall LC = [&] { - switch (ICA.getID()) { - case Intrinsic::modf: - return RTLIB::getMODF; - case Intrinsic::sincos: - return RTLIB::getSINCOS; - case Intrinsic::sincospi: - return RTLIB::getSINCOSPI; - default: - llvm_unreachable("unexpected intrinsic"); - } - }()(VT.getScalarType()); - std::optional<unsigned> CallRetElementIndex; // The first element of the modf result is returned by value in the // libcall. @@ -2160,7 +2169,7 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { CallRetElementIndex = 0; if (auto Cost = getMultipleResultIntrinsicVectorLibCallCost( - ICA, CostKind, LC, CallRetElementIndex)) + ICA, CostKind, CallRetElementIndex)) return *Cost; // Otherwise, fallback to default scalarization cost. break; diff --git a/llvm/include/llvm/CodeGen/MachineScheduler.h b/llvm/include/llvm/CodeGen/MachineScheduler.h index 6c5c27c9662e4..7b965d400ed08 100644 --- a/llvm/include/llvm/CodeGen/MachineScheduler.h +++ b/llvm/include/llvm/CodeGen/MachineScheduler.h @@ -1038,7 +1038,7 @@ class SchedBoundary { getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, unsigned ReleaseAtCycle, unsigned AcquireAtCycle); - bool isUnbufferedGroup(unsigned PIdx) const { + bool isReservedGroup(unsigned PIdx) const { return SchedModel->getProcResource(PIdx)->SubUnitsIdxBegin && !SchedModel->getProcResource(PIdx)->BufferSize; } diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 1a5ffb38f2568..0dd4f23c6d85f 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1260,9 +1260,15 @@ class SelectionDAG { /// stack arguments from being clobbered. LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain); - std::pair<SDValue, SDValue> getMemcmp(SDValue Chain, const SDLoc &dl, - SDValue Dst, SDValue Src, SDValue Size, - const CallInst *CI); + /// Lower a memcmp operation into a target library call and return the + /// resulting chain and call result as SelectionDAG SDValues. + LLVM_ABI std::pair<SDValue, SDValue> getMemcmp(SDValue Chain, const SDLoc &dl, + SDValue Dst, SDValue Src, + SDValue Size, + const CallInst *CI); + + /// Lower a strlen operation into a target library call and return the + /// resulting chain and call result as SelectionDAG SDValues. LLVM_ABI std::pair<SDValue, SDValue> getStrlen(SDValue Chain, const SDLoc &dl, SDValue Src, const CallInst *CI); diff --git a/llvm/include/llvm/DebugInfo/DIContext.h b/llvm/include/llvm/DebugInfo/DIContext.h index e7e87bbfebf38..b404c92e71836 100644 --- a/llvm/include/llvm/DebugInfo/DIContext.h +++ b/llvm/include/llvm/DebugInfo/DIContext.h @@ -211,6 +211,8 @@ struct DIDumpOptions { bool ShowAggregateErrors = false; bool PrintRegisterOnly = false; std::string JsonErrSummaryFile; + /// List of DWARF tags to filter children by. 
+ llvm::SmallVector<unsigned, 0> FilterChildTag; std::function<llvm::StringRef(uint64_t DwarfRegNum, bool IsEH)> GetNameForDWARFReg; diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td index 8ce2b1bea8fac..c086a39616249 100644 --- a/llvm/include/llvm/IR/Attributes.td +++ b/llvm/include/llvm/IR/Attributes.td @@ -183,6 +183,11 @@ def NoCallback : EnumAttr<"nocallback", IntersectAnd, [FnAttr]>; /// Specify how the pointer may be captured. def Captures : IntAttr<"captures", IntersectCustom, [ParamAttr]>; +/// Result will not be undef or poison if all arguments are not undef and not +/// poison. +def NoCreateUndefOrPoison + : EnumAttr<"nocreateundeforpoison", IntersectAnd, [FnAttr]>; + /// Function is not a source of divergence. def NoDivergenceSource : EnumAttr<"nodivergencesource", IntersectAnd, [FnAttr]>; diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 27930bbc651bd..8bd060ae8f485 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -3556,6 +3556,11 @@ class SwitchInstProfUpdateWrapper { /// correspondent branch weight. LLVM_ABI SwitchInst::CaseIt removeCase(SwitchInst::CaseIt I); + /// Replace the default destination by given case. Delegate the call to + /// the underlying SwitchInst::setDefaultDest and remove correspondent branch + /// weight. + LLVM_ABI void replaceDefaultDest(SwitchInst::CaseIt I); + /// Delegate the call to the underlying SwitchInst::addCase() and set the /// specified branch weight for the added case. LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest, CaseWeightOpt W); diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 4d59ee8676b9e..6a079f62dd9cf 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -186,6 +186,10 @@ def IntrSpeculatable : IntrinsicProperty; // defined by the hasSideEffects property of the TableGen Instruction class. def IntrHasSideEffects : IntrinsicProperty; +// Result will not be undef or poison if all arguments are not undef and not +// poison. +def IntrNoCreateUndefOrPoison : IntrinsicProperty; + //===----------------------------------------------------------------------===// // IIT constants and utils //===----------------------------------------------------------------------===// @@ -1039,7 +1043,7 @@ def int_experimental_memset_pattern // FIXME: Add version of these floating point intrinsics which allow non-default // rounding modes and FP exception handling. -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_fma : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; @@ -1052,16 +1056,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { // environment so they can be treated as readnone. 
def int_sqrt : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_powi : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_anyint_ty]>; - def int_asin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_acos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_atan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_atan2 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; def int_sin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_cos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_tan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_sinh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_cosh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_tanh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_pow : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; def int_log : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; @@ -1080,12 +1076,6 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_nearbyint : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_round : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_roundeven : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_sincos : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; - def int_sincospi : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; - def int_modf : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], - [llvm_anyfloat_ty]>; // Truncate a floating point number with a specific rounding mode def int_fptrunc_round : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ], @@ -1097,6 +1087,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_arithmetic_fence : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + // If the value doesn't fit an unspecified value is returned, but this + // is not poison so we can still mark these as IntrNoCreateUndefOrPoison. def int_lround : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_llround : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_lrint : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; @@ -1110,29 +1102,50 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { def int_frexp : DefaultAttrsIntrinsic<[llvm_anyfloat_ty, llvm_anyint_ty], [LLVMMatchType<0>]>; } +// TODO: Move all of these into the IntrNoCreateUndefOrPoison case above. +let IntrProperties = [IntrNoMem, IntrSpeculatable] in { + // These functions do not read memory, but are sensitive to the + // rounding mode. LLVM purposely does not model changes to the FP + // environment so they can be treated as readnone. 
+ def int_asin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_acos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_atan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_atan2 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; + def int_tan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_sinh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_cosh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_tanh : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_sincos : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; + def int_sincospi : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; + def int_modf : DefaultAttrsIntrinsic<[LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyfloat_ty]>; +} + def int_minnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maxnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_minimum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maximum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_minimumnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; def int_maximumnum : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative] + [IntrNoMem, IntrSpeculatable, Commutative, IntrNoCreateUndefOrPoison] >; // Internal interface for object size checking @@ -1164,7 +1177,7 @@ let IntrProperties = [IntrInaccessibleMemOnly] in { def int_is_fpclass : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, ImmArg<ArgIndex<1>>]>; //===--------------- Constrained Floating Point Intrinsics ----------------===// // @@ -1406,7 +1419,7 @@ def int_expect_with_probability : DefaultAttrsIntrinsic<[llvm_anyint_ty], // // None of these intrinsics accesses memory at all. -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_bswap: DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; def int_ctpop: DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; def int_bitreverse : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>; @@ -1521,7 +1534,7 @@ def int_adjust_trampoline : DefaultAttrsIntrinsic< // // Expose the carry flag from add operations on two integrals. 
-let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_sadd_with_overflow : DefaultAttrsIntrinsic<[llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [LLVMMatchType<0>, LLVMMatchType<0>]>; @@ -1547,16 +1560,16 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in { // def int_sadd_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Commutative]>; def int_uadd_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable, Commutative]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Commutative]>; def int_ssub_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_usub_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_sshl_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; @@ -1611,22 +1624,22 @@ def int_abs : DefaultAttrsIntrinsic< def int_smax : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_smin : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_umax : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_umin : DefaultAttrsIntrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; def int_scmp : DefaultAttrsIntrinsic< [llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>], - [IntrNoMem, IntrSpeculatable, Range<RetIndex, -1, 2>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Range<RetIndex, -1, 2>]>; def int_ucmp : DefaultAttrsIntrinsic< [llvm_anyint_ty], [llvm_anyint_ty, LLVMMatchType<1>], - [IntrNoMem, IntrSpeculatable, Range<RetIndex, -1, 2>]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison, Range<RetIndex, -1, 2>]>; //===------------------------- Memory Use Markers -------------------------===// // @@ -1868,7 +1881,7 @@ def int_convert_from_fp16 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [llvm_i16_ } // Saturating floating point to integer intrinsics -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { +let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in { def int_fptoui_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; def int_fptosi_sat : DefaultAttrsIntrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty]>; } @@ -1890,7 +1903,7 @@ def int_fake_use : DefaultAttrsIntrinsic<[], [llvm_vararg_ty], // First argument must be pointer or vector of pointer. This is checked by the // verifier. 
def int_ptrmask: DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_anyint_ty], - [IntrNoMem, IntrSpeculatable]>; + [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison]>; // Intrinsic to wrap a thread local variable. def int_threadlocal_address : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [LLVMMatchType<0>], diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index d6b85630eb979..9924b905aee63 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -140,6 +140,9 @@ def int_dx_isinf : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1 def int_dx_isnan : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [llvm_anyfloat_ty], [IntrNoMem]>; +def int_dx_legacyf16tof32 : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_float_ty>], + [llvm_anyint_ty], [IntrNoMem]>; + def int_dx_lerp : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>,LLVMMatchType<0>], [IntrNoMem]>; diff --git a/llvm/include/llvm/IR/IntrinsicsLoongArch.td b/llvm/include/llvm/IR/IntrinsicsLoongArch.td index 84026aa9d3624..1c46965d995fe 100644 --- a/llvm/include/llvm/IR/IntrinsicsLoongArch.td +++ b/llvm/include/llvm/IR/IntrinsicsLoongArch.td @@ -1192,4 +1192,42 @@ def int_loongarch_lasx_xvstelm_w def int_loongarch_lasx_xvstelm_d : VecInt<[], [llvm_v4i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrWriteMem, IntrArgMemOnly, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; + +// LASX and LSX conversion +def int_loongarch_lasx_cast_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; } // TargetPrefix = "loongarch" diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td 
b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index bc51fb639fd75..f39c6cda2c579 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -200,4 +200,7 @@ def int_spv_resource_nonuniformindex def int_spv_generic_cast_to_ptr_explicit : DefaultAttrsIntrinsic<[llvm_anyptr_ty], [generic_ptr_ty], [IntrNoMem, NoUndef<RetIndex>]>; + + def int_spv_unpackhalf2x16 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [llvm_i32_ty], [IntrNoMem]>; + } diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.h b/llvm/include/llvm/IR/RuntimeLibcalls.h index 01359894b0421..ab14ed44fed52 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.h +++ b/llvm/include/llvm/IR/RuntimeLibcalls.h @@ -186,6 +186,13 @@ struct RuntimeLibcallsInfo { return RTLIB::Unsupported; } + /// \returns the function type and attributes for the \p LibcallImpl, + /// depending on the target \p TT. If the function has incomplete type + /// information, return nullptr for the function type. + std::pair<FunctionType *, AttributeList> + getFunctionTy(LLVMContext &Ctx, const Triple &TT, const DataLayout &DL, + RTLIB::LibcallImpl LibcallImpl) const; + private: LLVM_ABI static iota_range<RTLIB::LibcallImpl> lookupLibcallImplNameImpl(StringRef Name); diff --git a/llvm/include/llvm/Object/MachO.h b/llvm/include/llvm/Object/MachO.h index 01e7c6b07dd36..f4c1e30b097ee 100644 --- a/llvm/include/llvm/Object/MachO.h +++ b/llvm/include/llvm/Object/MachO.h @@ -447,7 +447,7 @@ class LLVM_ABI MachOObjectFile : public ObjectFile { uint64_t getSectionAddress(DataRefImpl Sec) const override; uint64_t getSectionIndex(DataRefImpl Sec) const override; uint64_t getSectionSize(DataRefImpl Sec) const override; - ArrayRef<uint8_t> getSectionContents(uint32_t Offset, uint64_t Size) const; + ArrayRef<uint8_t> getSectionContents(uint64_t Offset, uint64_t Size) const; Expected<ArrayRef<uint8_t>> getSectionContents(DataRefImpl Sec) const override; uint64_t getSectionAlignment(DataRefImpl Sec) const override; diff --git a/llvm/include/llvm/Support/Allocator.h b/llvm/include/llvm/Support/Allocator.h index bc0265904ef65..fffcbd9f3c1d8 100644 --- a/llvm/include/llvm/Support/Allocator.h +++ b/llvm/include/llvm/Support/Allocator.h @@ -380,7 +380,7 @@ class BumpPtrAllocatorImpl /// The standard BumpPtrAllocator which just uses the default template /// parameters. -typedef BumpPtrAllocatorImpl<> BumpPtrAllocator; +using BumpPtrAllocator = BumpPtrAllocatorImpl<>; /// A BumpPtrAllocator that allows only elements of a specific type to be /// allocated. 
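// Most of the llvm/Support header changes in this patch are mechanical
// typedef-to-using conversions. The two spellings are equivalent; a generic
// sketch with an illustrative type name:
//
//   typedef std::map<std::string, int> LegacyMap;   // old spelling
//   using LegacyMap = std::map<std::string, int>;   // equivalent alias
//
// The alias form additionally supports template parameters (alias templates),
// which the typedef spelling cannot express.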
diff --git a/llvm/include/llvm/Support/Atomic.h b/llvm/include/llvm/Support/Atomic.h index c2d9ae2da231c..3c62672a077f1 100644 --- a/llvm/include/llvm/Support/Atomic.h +++ b/llvm/include/llvm/Support/Atomic.h @@ -30,9 +30,9 @@ namespace llvm { LLVM_ABI void MemoryFence(); #ifdef _MSC_VER - typedef long cas_flag; + using cas_flag = long; #else - typedef uint32_t cas_flag; + using cas_flag = uint32_t; #endif LLVM_ABI cas_flag CompareAndSwap(volatile cas_flag *ptr, cas_flag new_value, cas_flag old_value); diff --git a/llvm/include/llvm/Support/BinaryStreamArray.h b/llvm/include/llvm/Support/BinaryStreamArray.h index ef2233c53ec2c..a7d03f6511f12 100644 --- a/llvm/include/llvm/Support/BinaryStreamArray.h +++ b/llvm/include/llvm/Support/BinaryStreamArray.h @@ -93,7 +93,7 @@ class VarStreamArray { friend class VarStreamArrayIterator<ValueType, Extractor>; public: - typedef VarStreamArrayIterator<ValueType, Extractor> Iterator; + using Iterator = VarStreamArrayIterator<ValueType, Extractor>; VarStreamArray() = default; @@ -156,8 +156,8 @@ template <typename ValueType, typename Extractor> class VarStreamArrayIterator : public iterator_facade_base<VarStreamArrayIterator<ValueType, Extractor>, std::forward_iterator_tag, const ValueType> { - typedef VarStreamArrayIterator<ValueType, Extractor> IterType; - typedef VarStreamArray<ValueType, Extractor> ArrayType; + using IterType = VarStreamArrayIterator<ValueType, Extractor>; + using ArrayType = VarStreamArray<ValueType, Extractor>; public: VarStreamArrayIterator(const ArrayType &Array, const Extractor &E, @@ -260,7 +260,7 @@ template <typename T> class FixedStreamArray { friend class FixedStreamArrayIterator<T>; public: - typedef FixedStreamArrayIterator<T> Iterator; + using Iterator = FixedStreamArrayIterator<T>; FixedStreamArray() = default; explicit FixedStreamArray(BinaryStreamRef Stream) : Stream(Stream) { diff --git a/llvm/include/llvm/Support/Casting.h b/llvm/include/llvm/Support/Casting.h index 6f6df2e9703ea..a6435a2562a2b 100644 --- a/llvm/include/llvm/Support/Casting.h +++ b/llvm/include/llvm/Support/Casting.h @@ -816,6 +816,42 @@ template <typename... Types> struct IsaAndPresentCheckPredicate { return isa_and_present<Types...>(Val); } }; + +//===----------------------------------------------------------------------===// +// Casting Function Objects +//===----------------------------------------------------------------------===// + +/// Usable in generic algorithms like map_range +template <typename U> struct StaticCastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return static_cast<U>(Val); + } +}; + +template <typename U> struct DynCastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return dyn_cast<U>(Val); + } +}; + +template <typename U> struct CastFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return cast<U>(Val); + } +}; + +template <typename U> struct CastIfPresentFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return cast_if_present<U>(Val); + } +}; + +template <typename U> struct DynCastIfPresentFunc { + template <typename T> decltype(auto) operator()(T &&Val) const { + return dyn_cast_if_present<U>(Val); + } +}; + } // namespace detail /// Function object wrapper for the `llvm::isa` type check. The function call @@ -841,6 +877,20 @@ template <typename... Types> inline constexpr detail::IsaAndPresentCheckPredicate<Types...> IsaAndPresentPred{}; +/// Function objects corresponding to the Cast types defined above. 
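/// A sketch of how the function objects declared just below compose with the
/// STLExtras range helpers; `Vals` (a range of `Value *`) and the lambda are
/// illustrative only:
/// \code
///   // Map every value to an Instruction *, or nullptr if it is not one.
///   auto MaybeInsts = map_range(Vals, DynCastTo<Instruction>);
///   // Drop the nulls.
///   auto Insts = make_filter_range(
///       MaybeInsts, [](Instruction *I) { return I != nullptr; });
/// \endcode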
+template <typename From> +inline constexpr detail::StaticCastFunc<From> StaticCastTo{}; + +template <typename From> inline constexpr detail::CastFunc<From> CastTo{}; + +template <typename From> +inline constexpr detail::CastIfPresentFunc<From> CastIfPresentTo{}; + +template <typename From> +inline constexpr detail::DynCastIfPresentFunc<From> DynCastIfPresentTo{}; + +template <typename From> inline constexpr detail::DynCastFunc<From> DynCastTo{}; + } // end namespace llvm #endif // LLVM_SUPPORT_CASTING_H diff --git a/llvm/include/llvm/Support/Chrono.h b/llvm/include/llvm/Support/Chrono.h index 5b8102d8e11cf..e5f98249cc074 100644 --- a/llvm/include/llvm/Support/Chrono.h +++ b/llvm/include/llvm/Support/Chrono.h @@ -150,10 +150,10 @@ template <> struct unit<std::nano> { template <typename Rep, typename Period> struct format_provider<std::chrono::duration<Rep, Period>> { private: - typedef std::chrono::duration<Rep, Period> Dur; - typedef std::conditional_t<std::chrono::treat_as_floating_point<Rep>::value, - double, intmax_t> - InternalRep; + using Dur = std::chrono::duration<Rep, Period>; + using InternalRep = + std::conditional_t<std::chrono::treat_as_floating_point<Rep>::value, + double, intmax_t>; template <typename AsPeriod> static InternalRep getAs(const Dur &D) { using namespace std::chrono; diff --git a/llvm/include/llvm/Support/ConvertUTF.h b/llvm/include/llvm/Support/ConvertUTF.h index bb1723518a490..ddf7057bff59d 100644 --- a/llvm/include/llvm/Support/ConvertUTF.h +++ b/llvm/include/llvm/Support/ConvertUTF.h @@ -126,10 +126,10 @@ namespace llvm { bit mask & shift operations. ------------------------------------------------------------------------ */ -typedef unsigned int UTF32; /* at least 32 bits */ -typedef unsigned short UTF16; /* at least 16 bits */ -typedef unsigned char UTF8; /* typically 8 bits */ -typedef unsigned char Boolean; /* 0 or 1 */ +using UTF32 = unsigned int; /* at least 32 bits */ +using UTF16 = unsigned short; /* at least 16 bits */ +using UTF8 = unsigned char; /* typically 8 bits */ +using Boolean = unsigned char; /* 0 or 1 */ /* Some fundamental constants */ #define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD @@ -146,17 +146,14 @@ typedef unsigned char Boolean; /* 0 or 1 */ #define UNI_UTF32_BYTE_ORDER_MARK_NATIVE 0x0000FEFF #define UNI_UTF32_BYTE_ORDER_MARK_SWAPPED 0xFFFE0000 -typedef enum { - conversionOK, /* conversion successful */ - sourceExhausted, /* partial character in source, but hit end */ - targetExhausted, /* insuff. room in target for conversion */ - sourceIllegal /* source sequence is illegal/malformed */ -} ConversionResult; - -typedef enum { - strictConversion = 0, - lenientConversion -} ConversionFlags; +enum ConversionResult { + conversionOK, /* conversion successful */ + sourceExhausted, /* partial character in source, but hit end */ + targetExhausted, /* insuff. 
room in target for conversion */ + sourceIllegal /* source sequence is illegal/malformed */ +}; + +enum ConversionFlags { strictConversion = 0, lenientConversion }; LLVM_ABI ConversionResult ConvertUTF8toUTF16(const UTF8 **sourceStart, const UTF8 *sourceEnd, diff --git a/llvm/include/llvm/Support/DebugCounter.h b/llvm/include/llvm/Support/DebugCounter.h index 39a08d499b67e..9904a0dd86559 100644 --- a/llvm/include/llvm/Support/DebugCounter.h +++ b/llvm/include/llvm/Support/DebugCounter.h @@ -140,7 +140,7 @@ class DebugCounter { } // Iterate through the registered counters - typedef UniqueVector<std::string> CounterVector; + using CounterVector = UniqueVector<std::string>; CounterVector::const_iterator begin() const { return RegisteredCounters.begin(); } diff --git a/llvm/include/llvm/Support/ErrorHandling.h b/llvm/include/llvm/Support/ErrorHandling.h index 4c17b6e83acd2..a4fd008a9ff3f 100644 --- a/llvm/include/llvm/Support/ErrorHandling.h +++ b/llvm/include/llvm/Support/ErrorHandling.h @@ -21,8 +21,8 @@ class StringRef; class Twine; /// An error handler callback. -typedef void (*fatal_error_handler_t)(void *user_data, const char *reason, - bool gen_crash_diag); +using fatal_error_handler_t = void (*)(void *user_data, const char *reason, + bool gen_crash_diag); /// install_fatal_error_handler - Installs a new error handler to be used /// whenever a serious (non-recoverable) error is encountered by LLVM. diff --git a/llvm/include/llvm/Support/FormatVariadicDetails.h b/llvm/include/llvm/Support/FormatVariadicDetails.h index 0fdc7b6f94da7..c0b245e297a58 100644 --- a/llvm/include/llvm/Support/FormatVariadicDetails.h +++ b/llvm/include/llvm/Support/FormatVariadicDetails.h @@ -63,8 +63,8 @@ template <typename T> class missing_format_adapter; template <class T> class has_FormatProvider { public: using Decayed = std::decay_t<T>; - typedef void (*Signature_format)(const Decayed &, llvm::raw_ostream &, - StringRef); + using Signature_format = void (*)(const Decayed &, llvm::raw_ostream &, + StringRef); template <typename U> using check = SameType<Signature_format, &U::format>; diff --git a/llvm/include/llvm/Support/GenericLoopInfo.h b/llvm/include/llvm/Support/GenericLoopInfo.h index b6bb360d9868f..9e2f61fd03e78 100644 --- a/llvm/include/llvm/Support/GenericLoopInfo.h +++ b/llvm/include/llvm/Support/GenericLoopInfo.h @@ -150,9 +150,9 @@ template <class BlockT, class LoopT> class LoopBase { assert(!isInvalid() && "Loop not in a valid state!"); return SubLoops; } - typedef typename std::vector<LoopT *>::const_iterator iterator; - typedef - typename std::vector<LoopT *>::const_reverse_iterator reverse_iterator; + using iterator = typename std::vector<LoopT *>::const_iterator; + using reverse_iterator = + typename std::vector<LoopT *>::const_reverse_iterator; iterator begin() const { return getSubLoops().begin(); } iterator end() const { return getSubLoops().end(); } reverse_iterator rbegin() const { return getSubLoops().rbegin(); } @@ -174,7 +174,7 @@ template <class BlockT, class LoopT> class LoopBase { assert(!isInvalid() && "Loop not in a valid state!"); return Blocks; } - typedef typename ArrayRef<BlockT *>::const_iterator block_iterator; + using block_iterator = typename ArrayRef<BlockT *>::const_iterator; block_iterator block_begin() const { return getBlocks().begin(); } block_iterator block_end() const { return getBlocks().end(); } inline iterator_range<block_iterator> blocks() const { @@ -302,7 +302,7 @@ template <class BlockT, class LoopT> class LoopBase { bool hasNoExitBlocks() const; /// Edge 
type. - typedef std::pair<BlockT *, BlockT *> Edge; + using Edge = std::pair<BlockT *, BlockT *>; /// Return all pairs of (_inside_block_,_outside_block_). void getExitEdges(SmallVectorImpl<Edge> &ExitEdges) const; @@ -575,9 +575,9 @@ template <class BlockT, class LoopT> class LoopInfoBase { /// iterator/begin/end - The interface to the top-level loops in the current /// function. /// - typedef typename std::vector<LoopT *>::const_iterator iterator; - typedef - typename std::vector<LoopT *>::const_reverse_iterator reverse_iterator; + using iterator = typename std::vector<LoopT *>::const_iterator; + using reverse_iterator = + typename std::vector<LoopT *>::const_reverse_iterator; iterator begin() const { return TopLevelLoops.begin(); } iterator end() const { return TopLevelLoops.end(); } reverse_iterator rbegin() const { return TopLevelLoops.rbegin(); } diff --git a/llvm/include/llvm/Support/GenericLoopInfoImpl.h b/llvm/include/llvm/Support/GenericLoopInfoImpl.h index 541678001a8ff..c830f0a67a448 100644 --- a/llvm/include/llvm/Support/GenericLoopInfoImpl.h +++ b/llvm/include/llvm/Support/GenericLoopInfoImpl.h @@ -459,7 +459,7 @@ template <class BlockT, class LoopT> static void discoverAndMapSubloop(LoopT *L, ArrayRef<BlockT *> Backedges, LoopInfoBase<BlockT, LoopT> *LI, const DomTreeBase<BlockT> &DomTree) { - typedef GraphTraits<Inverse<BlockT *>> InvBlockTraits; + using InvBlockTraits = GraphTraits<Inverse<BlockT *>>; unsigned NumBlocks = 0; unsigned NumSubloops = 0; @@ -513,8 +513,8 @@ static void discoverAndMapSubloop(LoopT *L, ArrayRef<BlockT *> Backedges, /// Populate all loop data in a stable order during a single forward DFS. template <class BlockT, class LoopT> class PopulateLoopsDFS { - typedef GraphTraits<BlockT *> BlockTraits; - typedef typename BlockTraits::ChildIteratorType SuccIterTy; + using BlockTraits = GraphTraits<BlockT *>; + using SuccIterTy = typename BlockTraits::ChildIteratorType; LoopInfoBase<BlockT, LoopT> *LI; diff --git a/llvm/include/llvm/Support/MD5.h b/llvm/include/llvm/Support/MD5.h index 4ba386753f397..dbcb66d7680e4 100644 --- a/llvm/include/llvm/Support/MD5.h +++ b/llvm/include/llvm/Support/MD5.h @@ -90,7 +90,7 @@ class MD5 { private: // Any 32-bit or wider unsigned integer data type will do. - typedef uint32_t MD5_u32plus; + using MD5_u32plus = uint32_t; // Internal State struct { diff --git a/llvm/include/llvm/Support/Mutex.h b/llvm/include/llvm/Support/Mutex.h index d61e3fd96efbe..3ca5c9a2f6be8 100644 --- a/llvm/include/llvm/Support/Mutex.h +++ b/llvm/include/llvm/Support/Mutex.h @@ -63,12 +63,12 @@ namespace llvm }; /// Mutex - A standard, always enforced mutex. 
- typedef SmartMutex<false> Mutex; + using Mutex = SmartMutex<false>; template <bool mt_only> using SmartScopedLock = std::lock_guard<SmartMutex<mt_only>>; - typedef SmartScopedLock<false> ScopedLock; + using ScopedLock = SmartScopedLock<false>; } } diff --git a/llvm/include/llvm/Support/OnDiskHashTable.h b/llvm/include/llvm/Support/OnDiskHashTable.h index d7d72cfbbc649..54c6b713478b9 100644 --- a/llvm/include/llvm/Support/OnDiskHashTable.h +++ b/llvm/include/llvm/Support/OnDiskHashTable.h @@ -69,7 +69,7 @@ template <typename Info> class OnDiskChainedHashTableGenerator { : Key(Key), Data(Data), Next(nullptr), Hash(InfoObj.ComputeHash(Key)) {} }; - typedef typename Info::offset_type offset_type; + using offset_type = typename Info::offset_type; offset_type NumBuckets; offset_type NumEntries; llvm::SpecificBumpPtrAllocator<Item> BA; @@ -278,12 +278,12 @@ template <typename Info> class OnDiskChainedHashTable { Info InfoObj; public: - typedef Info InfoType; - typedef typename Info::internal_key_type internal_key_type; - typedef typename Info::external_key_type external_key_type; - typedef typename Info::data_type data_type; - typedef typename Info::hash_value_type hash_value_type; - typedef typename Info::offset_type offset_type; + using InfoType = Info; + using internal_key_type = typename Info::internal_key_type; + using external_key_type = typename Info::external_key_type; + using data_type = typename Info::data_type; + using hash_value_type = typename Info::hash_value_type; + using offset_type = typename Info::offset_type; OnDiskChainedHashTable(offset_type NumBuckets, offset_type NumEntries, const unsigned char *Buckets, @@ -435,12 +435,12 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { const unsigned char *Payload; public: - typedef OnDiskChainedHashTable<Info> base_type; - typedef typename base_type::internal_key_type internal_key_type; - typedef typename base_type::external_key_type external_key_type; - typedef typename base_type::data_type data_type; - typedef typename base_type::hash_value_type hash_value_type; - typedef typename base_type::offset_type offset_type; + using base_type = OnDiskChainedHashTable<Info>; + using internal_key_type = typename base_type::internal_key_type; + using external_key_type = typename base_type::external_key_type; + using data_type = typename base_type::data_type; + using hash_value_type = typename base_type::hash_value_type; + using offset_type = typename base_type::offset_type; private: /// Iterates over all of the keys in the table. 
@@ -450,7 +450,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { offset_type NumEntriesLeft; public: - typedef external_key_type value_type; + using value_type = external_key_type; iterator_base(const unsigned char *const Ptr, offset_type NumEntries) : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries) {} @@ -505,7 +505,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { Info *InfoObj; public: - typedef external_key_type value_type; + using value_type = external_key_type; key_iterator(const unsigned char *const Ptr, offset_type NumEntries, Info *InfoObj) @@ -551,7 +551,7 @@ class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> { Info *InfoObj; public: - typedef data_type value_type; + using value_type = data_type; data_iterator(const unsigned char *const Ptr, offset_type NumEntries, Info *InfoObj) diff --git a/llvm/include/llvm/Support/PointerLikeTypeTraits.h b/llvm/include/llvm/Support/PointerLikeTypeTraits.h index 320f6b63b447e..a47d68406acf3 100644 --- a/llvm/include/llvm/Support/PointerLikeTypeTraits.h +++ b/llvm/include/llvm/Support/PointerLikeTypeTraits.h @@ -70,7 +70,7 @@ template <> struct PointerLikeTypeTraits<void *> { // Provide PointerLikeTypeTraits for const things. template <typename T> struct PointerLikeTypeTraits<const T> { - typedef PointerLikeTypeTraits<T> NonConst; + using NonConst = PointerLikeTypeTraits<T>; static inline const void *getAsVoidPointer(const T P) { return NonConst::getAsVoidPointer(P); @@ -83,7 +83,7 @@ template <typename T> struct PointerLikeTypeTraits<const T> { // Provide PointerLikeTypeTraits for const pointers. template <typename T> struct PointerLikeTypeTraits<const T *> { - typedef PointerLikeTypeTraits<T *> NonConst; + using NonConst = PointerLikeTypeTraits<T *>; static inline const void *getAsVoidPointer(const T *P) { return NonConst::getAsVoidPointer(const_cast<T *>(P)); diff --git a/llvm/include/llvm/Support/Program.h b/llvm/include/llvm/Support/Program.h index 53c2e7597b2b4..575e416587ea8 100644 --- a/llvm/include/llvm/Support/Program.h +++ b/llvm/include/llvm/Support/Program.h @@ -39,8 +39,8 @@ const char EnvPathSeparator = ';'; typedef unsigned long procid_t; // Must match the type of DWORD on Windows. typedef void *process_t; // Must match the type of HANDLE on Windows. #else -typedef ::pid_t procid_t; -typedef procid_t process_t; +using procid_t = ::pid_t; +using process_t = procid_t; #endif /// This struct encapsulates information about a process. diff --git a/llvm/include/llvm/Support/RISCVISAUtils.h b/llvm/include/llvm/Support/RISCVISAUtils.h index 165bb08d66431..05fd32e0e7cfe 100644 --- a/llvm/include/llvm/Support/RISCVISAUtils.h +++ b/llvm/include/llvm/Support/RISCVISAUtils.h @@ -40,8 +40,8 @@ struct ExtensionComparator { /// OrderedExtensionMap is std::map, it's specialized to keep entries /// in canonical order of extension. 
-typedef std::map<std::string, ExtensionVersion, ExtensionComparator> - OrderedExtensionMap; +using OrderedExtensionMap = + std::map<std::string, ExtensionVersion, ExtensionComparator>; } // namespace RISCVISAUtils diff --git a/llvm/include/llvm/Support/RWMutex.h b/llvm/include/llvm/Support/RWMutex.h index 8d221aaab9ab9..efc1ca19a1208 100644 --- a/llvm/include/llvm/Support/RWMutex.h +++ b/llvm/include/llvm/Support/RWMutex.h @@ -162,7 +162,7 @@ template <bool mt_only> class SmartRWMutex { bool try_lock() { return impl.try_lock(); } }; -typedef SmartRWMutex<false> RWMutex; +using RWMutex = SmartRWMutex<false>; /// ScopedReader - RAII acquisition of a reader lock #if !defined(LLVM_USE_RW_MUTEX_IMPL) @@ -179,7 +179,7 @@ template <bool mt_only> struct SmartScopedReader { ~SmartScopedReader() { mutex.unlock_shared(); } }; #endif -typedef SmartScopedReader<false> ScopedReader; +using ScopedReader = SmartScopedReader<false>; /// ScopedWriter - RAII acquisition of a writer lock #if !defined(LLVM_USE_RW_MUTEX_IMPL) @@ -196,7 +196,7 @@ template <bool mt_only> struct SmartScopedWriter { ~SmartScopedWriter() { mutex.unlock(); } }; #endif -typedef SmartScopedWriter<false> ScopedWriter; +using ScopedWriter = SmartScopedWriter<false>; } // end namespace sys } // end namespace llvm diff --git a/llvm/include/llvm/Support/Registry.h b/llvm/include/llvm/Support/Registry.h index c02f15e5e32b8..acd3b06fde6e7 100644 --- a/llvm/include/llvm/Support/Registry.h +++ b/llvm/include/llvm/Support/Registry.h @@ -43,8 +43,8 @@ namespace llvm { template <typename T> class Registry { public: - typedef T type; - typedef SimpleRegistryEntry<T> entry; + using type = T; + using entry = SimpleRegistryEntry<T>; class node; class iterator; diff --git a/llvm/include/llvm/Support/ScaledNumber.h b/llvm/include/llvm/Support/ScaledNumber.h index 07baf153e10c6..8ca8d457e339e 100644 --- a/llvm/include/llvm/Support/ScaledNumber.h +++ b/llvm/include/llvm/Support/ScaledNumber.h @@ -498,10 +498,10 @@ template <class DigitsT> class ScaledNumber : ScaledNumberBase { static_assert(!std::numeric_limits<DigitsT>::is_signed, "only unsigned floats supported"); - typedef DigitsT DigitsType; + using DigitsType = DigitsT; private: - typedef std::numeric_limits<DigitsType> DigitsLimits; + using DigitsLimits = std::numeric_limits<DigitsType>; static constexpr int Width = sizeof(DigitsType) * 8; static_assert(Width <= 64, "invalid integer width for digits"); @@ -782,7 +782,7 @@ uint64_t ScaledNumber<DigitsT>::scale(uint64_t N) const { template <class DigitsT> template <class IntT> IntT ScaledNumber<DigitsT>::toInt() const { - typedef std::numeric_limits<IntT> Limits; + using Limits = std::numeric_limits<IntT>; if (*this < 1) return 0; if (*this >= Limits::max()) diff --git a/llvm/include/llvm/Support/SuffixTree.h b/llvm/include/llvm/Support/SuffixTree.h index 4c78235abf508..eac66d84d6f63 100644 --- a/llvm/include/llvm/Support/SuffixTree.h +++ b/llvm/include/llvm/Support/SuffixTree.h @@ -219,7 +219,7 @@ class SuffixTree { } }; - typedef RepeatedSubstringIterator iterator; + using iterator = RepeatedSubstringIterator; iterator begin() { return iterator(Root, LeafNodes); } iterator end() { return iterator(nullptr); } }; diff --git a/llvm/include/llvm/Support/Threading.h b/llvm/include/llvm/Support/Threading.h index 88846807f111a..89d90b3438e92 100644 --- a/llvm/include/llvm/Support/Threading.h +++ b/llvm/include/llvm/Support/Threading.h @@ -53,7 +53,7 @@ constexpr bool llvm_is_multithreaded() { return LLVM_ENABLE_THREADS; } #if 
LLVM_THREADING_USE_STD_CALL_ONCE - typedef std::once_flag once_flag; +using once_flag = std::once_flag; #else diff --git a/llvm/include/llvm/Support/TrailingObjects.h b/llvm/include/llvm/Support/TrailingObjects.h index c47976524dcd9..218c2e336d77b 100644 --- a/llvm/include/llvm/Support/TrailingObjects.h +++ b/llvm/include/llvm/Support/TrailingObjects.h @@ -76,7 +76,7 @@ class TrailingObjectsBase { // number of a different type. e.g.: // ExtractSecondType<Foo..., int>::type template <typename Ty1, typename Ty2> struct ExtractSecondType { - typedef Ty2 type; + using type = Ty2; }; // TrailingObjectsImpl is somewhat complicated, because it is a @@ -101,8 +101,8 @@ class TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, PrevTy, NextTy, : public TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...> { - typedef TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...> - ParentType; + using ParentType = + TrailingObjectsImpl<Align, BaseTy, TopTrailingObj, NextTy, MoreTys...>; struct RequiresRealignment { static const bool value = alignof(PrevTy) < alignof(NextTy); diff --git a/llvm/include/llvm/Support/UnicodeCharRanges.h b/llvm/include/llvm/Support/UnicodeCharRanges.h index 7f1a9b3ff0c3b..2b5fc83d34690 100644 --- a/llvm/include/llvm/Support/UnicodeCharRanges.h +++ b/llvm/include/llvm/Support/UnicodeCharRanges.h @@ -37,7 +37,7 @@ inline bool operator<(UnicodeCharRange Range, uint32_t Value) { /// array. class UnicodeCharSet { public: - typedef ArrayRef<UnicodeCharRange> CharRanges; + using CharRanges = ArrayRef<UnicodeCharRange>; /// Constructs a UnicodeCharSet instance from an array of /// UnicodeCharRanges. diff --git a/llvm/include/llvm/Support/float128.h b/llvm/include/llvm/Support/float128.h index e15a98dc5a677..ffad1241c3e3d 100644 --- a/llvm/include/llvm/Support/float128.h +++ b/llvm/include/llvm/Support/float128.h @@ -14,7 +14,7 @@ namespace llvm { #if defined(__clang__) && defined(__FLOAT128__) && \ defined(__SIZEOF_INT128__) && !defined(__LONG_DOUBLE_IBM128__) #define HAS_IEE754_FLOAT128 -typedef __float128 float128; +using float128 = __float128; #elif defined(__FLOAT128__) && defined(__SIZEOF_INT128__) && \ !defined(__LONG_DOUBLE_IBM128__) && \ (defined(__GNUC__) || defined(__GNUG__)) diff --git a/llvm/include/llvm/Support/thread.h b/llvm/include/llvm/Support/thread.h index 16e322bfd8785..ecde62d8368e7 100644 --- a/llvm/include/llvm/Support/thread.h +++ b/llvm/include/llvm/Support/thread.h @@ -127,7 +127,7 @@ LLVM_ABI thread::id llvm_thread_get_current_id_impl(); template <class Function, class... Args> thread::thread(std::optional<unsigned> StackSizeInBytes, Function &&f, Args &&...args) { - typedef std::tuple<std::decay_t<Function>, std::decay_t<Args>...> CalleeTuple; + using CalleeTuple = std::tuple<std::decay_t<Function>, std::decay_t<Args>...>; std::unique_ptr<CalleeTuple> Callee( new CalleeTuple(std::forward<Function>(f), std::forward<Args>(args)...)); diff --git a/llvm/include/llvm/TableGen/CodeGenHelpers.h b/llvm/include/llvm/TableGen/CodeGenHelpers.h index e22c6d4f6d390..95866e306b5ff 100644 --- a/llvm/include/llvm/TableGen/CodeGenHelpers.h +++ b/llvm/include/llvm/TableGen/CodeGenHelpers.h @@ -20,6 +20,7 @@ #include <string> namespace llvm { + // Simple RAII helper for emitting ifdef-undef-endif scope. 
class IfDefEmitter { public: @@ -57,7 +58,7 @@ class NamespaceEmitter { NamespaceEmitter(raw_ostream &OS, StringRef NameUntrimmed) : Name(trim(NameUntrimmed).str()), OS(OS) { if (!Name.empty()) - OS << "namespace " << Name << " {\n"; + OS << "namespace " << Name << " {\n\n"; } ~NamespaceEmitter() { close(); } @@ -65,7 +66,7 @@ class NamespaceEmitter { // Explicit function to close the namespace scopes. void close() { if (!Closed && !Name.empty()) - OS << "} // namespace " << Name << "\n"; + OS << "\n} // namespace " << Name << "\n"; Closed = true; } diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index a013f27766051..eb35e3644bd02 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -5339,6 +5339,19 @@ struct AAPotentialConstantValues return nullptr; } + /// Return the minimum number of trailing zeros of the potential constants. + unsigned getAssumedMinTrailingZeros() const { + if (!isValidState() || getAssumedSet().empty()) + return 0; + unsigned TrailingZeros = getAssumedSet().begin()->getBitWidth() + 1; + for (const APInt &It : getAssumedSet()) { + if (It.countTrailingZeros() < TrailingZeros) + TrailingZeros = It.countTrailingZeros(); + } + if (TrailingZeros > getAssumedSet().begin()->getBitWidth()) + return 0; + return TrailingZeros; + } /// See AbstractAttribute::getName() StringRef getName() const override { return "AAPotentialConstantValues"; } diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index e27a9b1c44014..5d88e5f54e3d6 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -806,11 +806,11 @@ class AccessAnalysis { typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList; AccessAnalysis(const Loop *TheLoop, AAResults *AA, const LoopInfo *LI, - MemoryDepChecker::DepCandidates &DA, + DominatorTree &DT, MemoryDepChecker::DepCandidates &DA, PredicatedScalarEvolution &PSE, SmallPtrSetImpl<MDNode *> &LoopAliasScopes) - : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE), - LoopAliasScopes(LoopAliasScopes) { + : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DT(DT), DepCands(DA), + PSE(PSE), LoopAliasScopes(LoopAliasScopes) { // We're analyzing dependences across loop iterations. BAA.enableCrossIterationMode(); } @@ -934,6 +934,9 @@ class AccessAnalysis { /// The LoopInfo of the loop being checked. const LoopInfo *LI; + /// The dominator tree of the function. + DominatorTree &DT; + /// Sets of potentially dependent accesses - members of one set share an /// underlying pointer. The set "CheckDeps" identifies which sets really need a /// dependence check. @@ -1015,6 +1018,7 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, /// information from the IR pointer value to determine no-wrap. static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, + const DominatorTree &DT, std::optional<int64_t> Stride = std::nullopt) { // FIXME: This should probably only return true for NUW. if (AR->getNoWrapFlags(SCEV::NoWrapMask)) @@ -1029,8 +1033,18 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, // case, the GEP would be poison and any memory access dependent on it would // be immediate UB when executed.
if (auto *GEP = dyn_cast_if_present<GetElementPtrInst>(Ptr); - GEP && GEP->hasNoUnsignedSignedWrap()) - return true; + GEP && GEP->hasNoUnsignedSignedWrap()) { + // For the above reasoning to apply, the pointer must be dereferenced in + // every iteration. + if (L->getHeader() == L->getLoopLatch() || + any_of(GEP->users(), [L, &DT, GEP](User *U) { + if (getLoadStorePointerOperand(U) != GEP) + return false; + BasicBlock *UserBB = cast<Instruction>(U)->getParent(); + return !LoopAccessInfo::blockNeedsPredication(UserBB, L, &DT); + })) + return true; + } if (!Stride) Stride = getStrideFromAddRec(AR, L, AccessTy, Ptr, PSE); @@ -1293,7 +1307,7 @@ bool AccessAnalysis::createCheckForAccess( } if (!isNoWrap(PSE, AR, RTCheckPtrs.size() == 1 ? Ptr : nullptr, AccessTy, - TheLoop, Assume)) + TheLoop, Assume, DT)) return false; } @@ -1606,7 +1620,7 @@ void AccessAnalysis::processMemAccesses() { /// Check whether the access through \p Ptr has a constant stride. std::optional<int64_t> llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, - const Loop *Lp, + const Loop *Lp, const DominatorTree &DT, const DenseMap<Value *, const SCEV *> &StridesMap, bool Assume, bool ShouldCheckWrap) { const SCEV *PtrScev = replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr); @@ -1630,7 +1644,7 @@ llvm::getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, if (!ShouldCheckWrap || !Stride) return Stride; - if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, Stride)) + if (isNoWrap(PSE, AR, Ptr, AccessTy, Lp, Assume, DT, Stride)) return Stride; LLVM_DEBUG( @@ -2047,10 +2061,10 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize( BPtr->getType()->getPointerAddressSpace()) return MemoryDepChecker::Dependence::Unknown; - std::optional<int64_t> StrideAPtr = - getPtrStride(PSE, ATy, APtr, InnermostLoop, SymbolicStrides, true, true); - std::optional<int64_t> StrideBPtr = - getPtrStride(PSE, BTy, BPtr, InnermostLoop, SymbolicStrides, true, true); + std::optional<int64_t> StrideAPtr = getPtrStride( + PSE, ATy, APtr, InnermostLoop, *DT, SymbolicStrides, true, true); + std::optional<int64_t> StrideBPtr = getPtrStride( + PSE, BTy, BPtr, InnermostLoop, *DT, SymbolicStrides, true, true); const SCEV *Src = PSE.getSCEV(APtr); const SCEV *Sink = PSE.getSCEV(BPtr); @@ -2627,7 +2641,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, } MemoryDepChecker::DepCandidates DepCands; - AccessAnalysis Accesses(TheLoop, AA, LI, DepCands, *PSE, LoopAliasScopes); + AccessAnalysis Accesses(TheLoop, AA, LI, *DT, DepCands, *PSE, + LoopAliasScopes); // Holds the analyzed pointers. We don't want to call getUnderlyingObjects // multiple times on the same object. 
If the ptr is accessed twice, once @@ -2691,7 +2706,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI, bool IsReadOnlyPtr = false; Type *AccessTy = getLoadStoreType(LD); if (Seen.insert({Ptr, AccessTy}).second || - !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, SymbolicStrides)) { + !getPtrStride(*PSE, AccessTy, Ptr, TheLoop, *DT, SymbolicStrides, false, + true)) { ++NumReads; IsReadOnlyPtr = true; } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 0a72076f51824..789a98366cead 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7419,84 +7419,20 @@ static bool canCreateUndefOrPoison(const Operator *Op, UndefPoisonKind Kind, if (cast<ConstantInt>(II->getArgOperand(1))->isNullValue()) return false; break; - case Intrinsic::ctpop: - case Intrinsic::bswap: - case Intrinsic::bitreverse: - case Intrinsic::fshl: - case Intrinsic::fshr: - case Intrinsic::smax: - case Intrinsic::smin: - case Intrinsic::scmp: - case Intrinsic::umax: - case Intrinsic::umin: - case Intrinsic::ucmp: - case Intrinsic::ptrmask: - case Intrinsic::fptoui_sat: - case Intrinsic::fptosi_sat: - case Intrinsic::sadd_with_overflow: - case Intrinsic::ssub_with_overflow: - case Intrinsic::smul_with_overflow: - case Intrinsic::uadd_with_overflow: - case Intrinsic::usub_with_overflow: - case Intrinsic::umul_with_overflow: - case Intrinsic::sadd_sat: - case Intrinsic::uadd_sat: - case Intrinsic::ssub_sat: - case Intrinsic::usub_sat: - return false; case Intrinsic::sshl_sat: case Intrinsic::ushl_sat: - return includesPoison(Kind) && - !shiftAmountKnownInRange(II->getArgOperand(1)); - case Intrinsic::fma: - case Intrinsic::fmuladd: - case Intrinsic::sqrt: - case Intrinsic::powi: - case Intrinsic::sin: - case Intrinsic::cos: - case Intrinsic::pow: - case Intrinsic::log: - case Intrinsic::log10: - case Intrinsic::log2: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::exp10: - case Intrinsic::fabs: - case Intrinsic::copysign: - case Intrinsic::floor: - case Intrinsic::ceil: - case Intrinsic::trunc: - case Intrinsic::rint: - case Intrinsic::nearbyint: - case Intrinsic::round: - case Intrinsic::roundeven: - case Intrinsic::fptrunc_round: - case Intrinsic::canonicalize: - case Intrinsic::arithmetic_fence: - case Intrinsic::minnum: - case Intrinsic::maxnum: - case Intrinsic::minimum: - case Intrinsic::maximum: - case Intrinsic::minimumnum: - case Intrinsic::maximumnum: - case Intrinsic::is_fpclass: - case Intrinsic::ldexp: - case Intrinsic::frexp: - return false; - case Intrinsic::lround: - case Intrinsic::llround: - case Intrinsic::lrint: - case Intrinsic::llrint: - // If the value doesn't fit an unspecified value is returned (but this - // is not poison). 
- return false; + if (!includesPoison(Kind) || + shiftAmountKnownInRange(II->getArgOperand(1))) + return false; + break; } } [[fallthrough]]; case Instruction::CallBr: case Instruction::Invoke: { const auto *CB = cast<CallBase>(Op); - return !CB->hasRetAttr(Attribute::NoUndef); + return !CB->hasRetAttr(Attribute::NoUndef) && + !CB->hasFnAttr(Attribute::NoCreateUndefOrPoison); } case Instruction::InsertElement: case Instruction::ExtractElement: { @@ -10405,3 +10341,55 @@ const Value *llvm::stripNullTest(const Value *V) { Value *llvm::stripNullTest(Value *V) { return const_cast<Value *>(stripNullTest(const_cast<const Value *>(V))); } + +bool llvm::collectPossibleValues(const Value *V, + SmallPtrSetImpl<const Constant *> &Constants, + unsigned MaxCount, bool AllowUndefOrPoison) { + SmallPtrSet<const Instruction *, 8> Visited; + SmallVector<const Instruction *, 8> Worklist; + auto Push = [&](const Value *V) -> bool { + if (auto *C = dyn_cast<Constant>(V)) { + if (!AllowUndefOrPoison && !isGuaranteedNotToBeUndefOrPoison(C)) + return false; + // Check existence first to avoid unnecessary allocations. + if (Constants.contains(C)) + return true; + if (Constants.size() == MaxCount) + return false; + Constants.insert(C); + return true; + } + + if (auto *Inst = dyn_cast<Instruction>(V)) { + if (Visited.insert(Inst).second) + Worklist.push_back(Inst); + return true; + } + return false; + }; + if (!Push(V)) + return false; + while (!Worklist.empty()) { + const Instruction *CurInst = Worklist.pop_back_val(); + switch (CurInst->getOpcode()) { + case Instruction::Select: + if (!Push(CurInst->getOperand(1))) + return false; + if (!Push(CurInst->getOperand(2))) + return false; + break; + case Instruction::PHI: + for (Value *IncomingValue : cast<PHINode>(CurInst)->incoming_values()) { + // Fast path for recurrence PHI. + if (IncomingValue == CurInst) + continue; + if (!Push(IncomingValue)) + return false; + } + break; + default: + return false; + } + } + return true; +} diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 091d94843698c..977ed59e09243 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1387,9 +1387,9 @@ void InterleavedAccessInfo::collectConstStrideAccesses( // wrap around the address space we would do a memory access at nullptr // even without the transformation. The wrapping checks are therefore // deferred until after we've formed the interleaved groups. 
- int64_t Stride = - getPtrStride(PSE, ElementTy, Ptr, TheLoop, Strides, - /*Assume=*/true, /*ShouldCheckWrap=*/false).value_or(0); + int64_t Stride = getPtrStride(PSE, ElementTy, Ptr, TheLoop, *DT, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false) + .value_or(0); const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr); AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, @@ -1643,8 +1643,9 @@ void InterleavedAccessInfo::analyzeInterleaving( assert(Member && "Group member does not exist"); Value *MemberPtr = getLoadStorePointerOperand(Member); Type *AccessTy = getLoadStoreType(Member); - if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides, - /*Assume=*/false, /*ShouldCheckWrap=*/true).value_or(0)) + if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, *DT, Strides, + /*Assume=*/false, /*ShouldCheckWrap=*/true) + .value_or(0)) return false; LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to " << FirstOrLast diff --git a/llvm/lib/BinaryFormat/CMakeLists.txt b/llvm/lib/BinaryFormat/CMakeLists.txt index 4b2debb7ae236..0c8af1e7a4565 100644 --- a/llvm/lib/BinaryFormat/CMakeLists.txt +++ b/llvm/lib/BinaryFormat/CMakeLists.txt @@ -6,7 +6,6 @@ add_llvm_component_library(LLVMBinaryFormat ELF.cpp MachO.cpp Magic.cpp - Minidump.cpp MsgPackDocument.cpp MsgPackDocumentYAML.cpp MsgPackReader.cpp diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 466dcb02696f4..8930d64de5e37 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -2257,6 +2257,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) { return Attribute::Captures; case bitc::ATTR_KIND_DEAD_ON_RETURN: return Attribute::DeadOnReturn; + case bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON: + return Attribute::NoCreateUndefOrPoison; } } @@ -8566,16 +8568,13 @@ Expected<std::unique_ptr<ModuleSummaryIndex>> BitcodeModule::getSummary() { } static Expected<std::pair<bool, bool>> -getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, - unsigned ID, - BitcodeLTOInfo <OInfo) { +getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, unsigned ID) { if (Error Err = Stream.EnterSubBlock(ID)) return std::move(Err); - SmallVector<uint64_t, 64> Record; + SmallVector<uint64_t, 64> Record; while (true) { BitstreamEntry Entry; - std::pair<bool, bool> Result = {false,false}; if (Error E = Stream.advanceSkippingSubblocks().moveInto(Entry)) return std::move(E); @@ -8584,8 +8583,8 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, case BitstreamEntry::Error: return error("Malformed block"); case BitstreamEntry::EndBlock: { - // If no flags record found, set both flags to false. - return Result; + // If no flags record found, return both flags as false. + return std::make_pair(false, false); } case BitstreamEntry::Record: // The interesting case. 
@@ -8607,9 +8606,7 @@ getEnableSplitLTOUnitAndUnifiedFlag(BitstreamCursor &Stream, bool EnableSplitLTOUnit = Flags & 0x8; bool UnifiedLTO = Flags & 0x200; - Result = {EnableSplitLTOUnit, UnifiedLTO}; - - return Result; + return std::make_pair(EnableSplitLTOUnit, UnifiedLTO); } } } @@ -8638,26 +8635,15 @@ Expected<BitcodeLTOInfo> BitcodeModule::getLTOInfo() { /*EnableSplitLTOUnit=*/false, /*UnifiedLTO=*/false}; case BitstreamEntry::SubBlock: - if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID) { - BitcodeLTOInfo LTOInfo; + if (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID || + Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); + getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID); if (!Flags) return Flags.takeError(); - std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = true; - LTOInfo.HasSummary = true; - return LTOInfo; - } - - if (Entry.ID == bitc::FULL_LTO_GLOBALVAL_SUMMARY_BLOCK_ID) { BitcodeLTOInfo LTOInfo; - Expected<std::pair<bool, bool>> Flags = - getEnableSplitLTOUnitAndUnifiedFlag(Stream, Entry.ID, LTOInfo); - if (!Flags) - return Flags.takeError(); std::tie(LTOInfo.EnableSplitLTOUnit, LTOInfo.UnifiedLTO) = Flags.get(); - LTOInfo.IsThinLTO = false; + LTOInfo.IsThinLTO = (Entry.ID == bitc::GLOBALVAL_SUMMARY_BLOCK_ID); LTOInfo.HasSummary = true; return LTOInfo; } diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index f17656c7c3b03..76494c792ac7b 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -956,6 +956,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) { return bitc::ATTR_KIND_CAPTURES; case Attribute::DeadOnReturn: return bitc::ATTR_KIND_DEAD_ON_RETURN; + case Attribute::NoCreateUndefOrPoison: + return bitc::ATTR_KIND_NO_CREATE_UNDEF_OR_POISON; case Attribute::EndAttrKinds: llvm_unreachable("Can not encode end-attribute kinds marker."); case Attribute::None: diff --git a/llvm/lib/CAS/ActionCaches.cpp b/llvm/lib/CAS/ActionCaches.cpp index 571c5b3ca5b4b..003c850275ff4 100644 --- a/llvm/lib/CAS/ActionCaches.cpp +++ b/llvm/lib/CAS/ActionCaches.cpp @@ -13,7 +13,11 @@ #include "BuiltinCAS.h" #include "llvm/ADT/TrieRawHashMap.h" #include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Config/llvm-config.h" #include "llvm/Support/BLAKE3.h" +#include "llvm/Support/Errc.h" #define DEBUG_TYPE "cas-action-caches" @@ -47,12 +51,54 @@ class InMemoryActionCache final : public ActionCache { Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, bool CanBeDistributed) const final; + Error validate() const final { + return createStringError("InMemoryActionCache doesn't support validate()"); + } + private: using DataT = CacheEntry<sizeof(HashType)>; using InMemoryCacheT = ThreadSafeTrieRawHashMap<DataT, sizeof(HashType)>; InMemoryCacheT Cache; }; + +/// Builtin basic OnDiskActionCache that uses one underlying OnDiskKeyValueDB. 
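+/// It maps an action key (a hash) directly to the fixed-size hash of the
+/// cached result in the underlying key-value database.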
+class OnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, + bool CanBeDistributed) const final; + + static Expected<std::unique_ptr<OnDiskActionCache>> create(StringRef Path); + + Error validate() const final; + +private: + static StringRef getHashName() { return "BLAKE3"; } + + OnDiskActionCache(std::unique_ptr<ondisk::OnDiskKeyValueDB> DB); + + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; + using DataT = CacheEntry<sizeof(HashType)>; +}; + +/// Builtin unified ActionCache that wraps around UnifiedOnDiskCache to provide +/// access to its ActionCache. +class UnifiedOnDiskActionCache final : public ActionCache { +public: + Error putImpl(ArrayRef<uint8_t> ActionKey, const CASID &Result, + bool CanBeDistributed) final; + Expected<std::optional<CASID>> getImpl(ArrayRef<uint8_t> ActionKey, + bool CanBeDistributed) const final; + + UnifiedOnDiskActionCache(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + + Error validate() const final; + +private: + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; +}; } // end namespace static Error createResultCachePoisonedError(ArrayRef<uint8_t> KeyHash, @@ -99,3 +145,123 @@ std::unique_ptr<ActionCache> createInMemoryActionCache() { } } // namespace llvm::cas + +OnDiskActionCache::OnDiskActionCache( + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + DB(std::move(DB)) {} + +Expected<std::unique_ptr<OnDiskActionCache>> +OnDiskActionCache::create(StringRef AbsPath) { + std::unique_ptr<ondisk::OnDiskKeyValueDB> DB; + if (Error E = ondisk::OnDiskKeyValueDB::open(AbsPath, getHashName(), + sizeof(HashType), getHashName(), + sizeof(DataT)) + .moveInto(DB)) + return std::move(E); + return std::unique_ptr<OnDiskActionCache>( + new OnDiskActionCache(std::move(DB))); +} + +Expected<std::optional<CASID>> +OnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, + bool /*CanBeDistributed*/) const { + std::optional<ArrayRef<char>> Val; + if (Error E = DB->get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + return CASID::create(&getContext(), toStringRef(*Val)); +} + +Error OnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, const CASID &Result, + bool /*CanBeDistributed*/) { + auto ResultHash = Result.getHash(); + ArrayRef Expected((const char *)ResultHash.data(), ResultHash.size()); + ArrayRef<char> Observed; + if (Error E = DB->put(Key, Expected).moveInto(Observed)) + return E; + + if (Expected == Observed) + return Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, + ArrayRef((const uint8_t *)Observed.data(), Observed.size())); +} + +Error OnDiskActionCache::validate() const { + // FIXME: without the matching CAS there is nothing we can check about the + // cached values. The hash size is already validated by the DB validator. 
+ return DB->validate(nullptr); +} + +UnifiedOnDiskActionCache::UnifiedOnDiskActionCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) + : ActionCache(builtin::BuiltinCASContext::getDefaultContext()), + UniDB(std::move(UniDB)) {} + +Expected<std::optional<CASID>> +UnifiedOnDiskActionCache::getImpl(ArrayRef<uint8_t> Key, + bool /*CanBeDistributed*/) const { + std::optional<ArrayRef<char>> Val; + if (Error E = UniDB->getKeyValueDB().get(Key).moveInto(Val)) + return std::move(E); + if (!Val) + return std::nullopt; + auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Val); + return CASID::create(&getContext(), + toStringRef(UniDB->getGraphDB().getDigest(ID))); +} + +Error UnifiedOnDiskActionCache::putImpl(ArrayRef<uint8_t> Key, + const CASID &Result, + bool /*CanBeDistributed*/) { + auto Expected = UniDB->getGraphDB().getReference(Result.getHash()); + if (LLVM_UNLIKELY(!Expected)) + return Expected.takeError(); + + auto Value = ondisk::UnifiedOnDiskCache::getValueFromObjectID(*Expected); + std::optional<ArrayRef<char>> Observed; + if (Error E = UniDB->getKeyValueDB().put(Key, Value).moveInto(Observed)) + return E; + + auto ObservedID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(*Observed); + if (*Expected == ObservedID) + return Error::success(); + + return createResultCachePoisonedError( + Key, getContext(), Result, UniDB->getGraphDB().getDigest(ObservedID)); +} + +Error UnifiedOnDiskActionCache::validate() const { + auto ValidateRef = [](FileOffset Offset, ArrayRef<char> Value) -> Error { + auto ID = ondisk::UnifiedOnDiskCache::getObjectIDFromValue(Value); + auto formatError = [&](Twine Msg) { + return createStringError( + llvm::errc::illegal_byte_sequence, + "bad record at 0x" + + utohexstr((unsigned)Offset.get(), /*LowerCase=*/true) + ": " + + Msg.str()); + }; + if (ID.getOpaqueData() == 0) + return formatError("zero is not a valid ref"); + return Error::success(); + }; + return UniDB->getKeyValueDB().validate(ValidateRef); +} + +Expected<std::unique_ptr<ActionCache>> +cas::createOnDiskActionCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return OnDiskActionCache::create(Path); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} + +std::unique_ptr<ActionCache> +cas::builtin::createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { + return std::make_unique<UnifiedOnDiskActionCache>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/BuiltinCAS.cpp b/llvm/lib/CAS/BuiltinCAS.cpp index 73646ad2c3528..e9bc6d8beed4e 100644 --- a/llvm/lib/CAS/BuiltinCAS.cpp +++ b/llvm/lib/CAS/BuiltinCAS.cpp @@ -9,6 +9,7 @@ #include "BuiltinCAS.h" #include "llvm/ADT/StringExtras.h" #include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/Process.h" using namespace llvm; @@ -68,7 +69,7 @@ Expected<ObjectRef> BuiltinCAS::store(ArrayRef<ObjectRef> Refs, Refs, Data); } -Error BuiltinCAS::validate(const CASID &ID) { +Error BuiltinCAS::validateObject(const CASID &ID) { auto Ref = getReference(ID); if (!Ref) return createUnknownObjectError(ID); @@ -92,3 +93,14 @@ Error BuiltinCAS::validate(const CASID &ID) { return Error::success(); } + +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +cas::builtin::createBuiltinUnifiedOnDiskCache(StringRef Path) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::open(Path, /*SizeLimit=*/std::nullopt, + BuiltinCASContext::getHashName(), + sizeof(HashType)); +#else + return 
createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/BuiltinCAS.h b/llvm/lib/CAS/BuiltinCAS.h index 3b5374d5e1850..4d2de66cf636f 100644 --- a/llvm/lib/CAS/BuiltinCAS.h +++ b/llvm/lib/CAS/BuiltinCAS.h @@ -1,4 +1,4 @@ -//===- BuiltinCAS.h ---------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -15,6 +15,9 @@ namespace llvm::cas { class ActionCache; +namespace ondisk { +class UnifiedOnDiskCache; +} // namespace ondisk namespace builtin { /// Common base class for builtin CAS implementations using the same CASContext. @@ -65,9 +68,27 @@ class BuiltinCAS : public ObjectStore { "corrupt storage"); } - Error validate(const CASID &ID) final; + Error validateObject(const CASID &ID) final; }; +/// Create a \p UnifiedOnDiskCache instance that uses \p BLAKE3 hashing. +Expected<std::unique_ptr<ondisk::UnifiedOnDiskCache>> +createBuiltinUnifiedOnDiskCache(StringRef Path); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ObjectStore> createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +/// \param UniDB A \p UnifiedOnDiskCache instance from \p +/// createBuiltinUnifiedOnDiskCache. +std::unique_ptr<ActionCache> createActionCacheFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB); + +// FIXME: Proxy not portable. Maybe also error-prone? +constexpr StringLiteral DefaultDirProxy = "/^llvm::cas::builtin::default"; +constexpr StringLiteral DefaultDir = "llvm.cas.builtin.default"; + } // end namespace builtin } // end namespace llvm::cas diff --git a/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp new file mode 100644 index 0000000000000..f3f6fa043bc52 --- /dev/null +++ b/llvm/lib/CAS/BuiltinUnifiedCASDatabases.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "BuiltinCAS.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" + +using namespace llvm; +using namespace llvm::cas; + +Expected<std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>>> +cas::createOnDiskUnifiedCASDatabases(StringRef Path) { + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB; + if (Error E = builtin::createBuiltinUnifiedOnDiskCache(Path).moveInto(UniDB)) + return std::move(E); + auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); + auto AC = builtin::createActionCacheFromUnifiedOnDiskCache(std::move(UniDB)); + return std::make_pair(std::move(CAS), std::move(AC)); +} + +Expected<ValidationResult> cas::validateOnDiskUnifiedCASDatabasesIfNeeded( + StringRef Path, bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinary) { +#if LLVM_ENABLE_ONDISK_CAS + return ondisk::UnifiedOnDiskCache::validateIfNeeded( + Path, builtin::BuiltinCASContext::getHashName(), + sizeof(builtin::HashType), CheckHash, AllowRecovery, ForceValidation, + LLVMCasBinary); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCache is disabled"); +#endif +} diff --git a/llvm/lib/CAS/CMakeLists.txt b/llvm/lib/CAS/CMakeLists.txt index a2f8c49e50145..aad77dce370d8 100644 --- a/llvm/lib/CAS/CMakeLists.txt +++ b/llvm/lib/CAS/CMakeLists.txt @@ -2,15 +2,18 @@ add_llvm_component_library(LLVMCAS ActionCache.cpp ActionCaches.cpp BuiltinCAS.cpp + BuiltinUnifiedCASDatabases.cpp DatabaseFile.cpp InMemoryCAS.cpp MappedFileRegionArena.cpp ObjectStore.cpp + OnDiskCAS.cpp OnDiskCommon.cpp OnDiskDataAllocator.cpp OnDiskGraphDB.cpp OnDiskKeyValueDB.cpp OnDiskTrieRawHashMap.cpp + UnifiedOnDiskCache.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/CAS diff --git a/llvm/lib/CAS/InMemoryCAS.cpp b/llvm/lib/CAS/InMemoryCAS.cpp index c63ee70de0849..2d4eedd5bdc8f 100644 --- a/llvm/lib/CAS/InMemoryCAS.cpp +++ b/llvm/lib/CAS/InMemoryCAS.cpp @@ -233,6 +233,12 @@ class InMemoryCAS : public BuiltinCAS { return cast<InMemoryObject>(asInMemoryObject(Node)).getData(); } + void print(raw_ostream &OS) const final; + + Error validate(bool CheckHash) const final { + return createStringError("InMemoryCAS doesn't support validate()"); + } + InMemoryCAS() = default; private: @@ -271,6 +277,8 @@ ArrayRef<const InMemoryObject *> InMemoryObject::getRefs() const { return cast<InMemoryInlineObject>(this)->getRefsImpl(); } +void InMemoryCAS::print(raw_ostream &OS) const {} + Expected<ObjectRef> InMemoryCAS::storeFromNullTerminatedRegion(ArrayRef<uint8_t> ComputedHash, sys::fs::mapped_file_region Map) { diff --git a/llvm/lib/CAS/ObjectStore.cpp b/llvm/lib/CAS/ObjectStore.cpp index e0be50bbe013a..3110577e03774 100644 --- a/llvm/lib/CAS/ObjectStore.cpp +++ b/llvm/lib/CAS/ObjectStore.cpp @@ -1,4 +1,4 @@ -//===- ObjectStore.cpp ------------------------------------------*- C++ -*-===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -12,7 +12,7 @@ #include "llvm/Support/Errc.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" -#include <optional> +#include <deque> using namespace llvm; using namespace llvm::cas; @@ -21,6 +21,7 @@ void CASContext::anchor() {} void ObjectStore::anchor() {} LLVM_DUMP_METHOD void CASID::dump() const { print(dbgs()); } +LLVM_DUMP_METHOD void ObjectStore::dump() const { print(dbgs()); } LLVM_DUMP_METHOD void ObjectRef::dump() const { print(dbgs()); } LLVM_DUMP_METHOD void ObjectHandle::dump() const { print(dbgs()); } @@ -141,7 +142,7 @@ Error ObjectStore::validateTree(ObjectRef Root) { auto [I, Inserted] = ValidatedRefs.insert(Ref); if (!Inserted) continue; // already validated. - if (Error E = validate(getID(Ref))) + if (Error E = validateObject(getID(Ref))) return E; Expected<ObjectHandle> Obj = load(Ref); if (!Obj) @@ -155,6 +156,92 @@ Error ObjectStore::validateTree(ObjectRef Root) { return Error::success(); } +Expected<ObjectRef> ObjectStore::importObject(ObjectStore &Upstream, + ObjectRef Other) { + // Copy the full CAS tree from upstream with depth-first ordering to ensure + // all the child nodes are available in the downstream CAS before inserting + // the current object. This uses a similar algorithm to + // `OnDiskGraphDB::importFullTree` but doesn't assume the upstream CAS schema + // so it can be used to import from any other ObjectStore regardless of the + // CAS schema. + + // There is no work to do if importing from self. + if (this == &Upstream) + return Other; + + /// Keeps track of the state of visitation for the current node and all of its + /// parents. UpstreamCursor holds information only from the upstream CAS. + struct UpstreamCursor { + ObjectRef Ref; + ObjectHandle Node; + size_t RefsCount; + std::deque<ObjectRef> Refs; + }; + SmallVector<UpstreamCursor, 16> CursorStack; + /// PrimaryRefStack holds the ObjectRefs of the current CAS, with nodes either + /// just stored in the CAS or nodes that already exist in the current CAS. + SmallVector<ObjectRef, 128> PrimaryRefStack; + /// A map from upstream ObjectRef to current ObjectRef. + llvm::DenseMap<ObjectRef, ObjectRef> CreatedObjects; + + auto enqueueNode = [&](ObjectRef Ref, ObjectHandle Node) { + unsigned NumRefs = Upstream.getNumRefs(Node); + std::deque<ObjectRef> Refs; + for (unsigned I = 0; I < NumRefs; ++I) + Refs.push_back(Upstream.readRef(Node, I)); + + CursorStack.push_back({Ref, Node, NumRefs, std::move(Refs)}); + }; + + auto UpstreamHandle = Upstream.load(Other); + if (!UpstreamHandle) + return UpstreamHandle.takeError(); + enqueueNode(Other, *UpstreamHandle); + + while (!CursorStack.empty()) { + UpstreamCursor &Cur = CursorStack.back(); + if (Cur.Refs.empty()) { + // Copy the node data into the primary store. + // The top of \p PrimaryRefStack contains the new ObjectRefs for the + // children of the current node. + assert(PrimaryRefStack.size() >= Cur.RefsCount); + auto Refs = ArrayRef(PrimaryRefStack) + .slice(PrimaryRefStack.size() - Cur.RefsCount); + auto NewNode = store(Refs, Upstream.getData(Cur.Node)); + if (!NewNode) + return NewNode.takeError(); + + // Remove the current node's cursor and its children's refs from the stacks. + PrimaryRefStack.truncate(PrimaryRefStack.size() - Cur.RefsCount); + CursorStack.pop_back(); + + PrimaryRefStack.push_back(*NewNode); + CreatedObjects.try_emplace(Cur.Ref, *NewNode); + continue; + } + + // Check if the node exists already.
+ auto CurrentID = Cur.Refs.front(); + Cur.Refs.pop_front(); + auto Ref = CreatedObjects.find(CurrentID); + if (Ref != CreatedObjects.end()) { + // If exists already, just need to enqueue the primary node. + PrimaryRefStack.push_back(Ref->second); + continue; + } + + // Load child. + auto PrimaryID = Upstream.load(CurrentID); + if (LLVM_UNLIKELY(!PrimaryID)) + return PrimaryID.takeError(); + + enqueueNode(CurrentID, *PrimaryID); + } + + assert(PrimaryRefStack.size() == 1); + return PrimaryRefStack.front(); +} + std::unique_ptr<MemoryBuffer> ObjectProxy::getMemoryBuffer(StringRef Name, bool RequiresNullTerminator) const { diff --git a/llvm/lib/CAS/OnDiskCAS.cpp b/llvm/lib/CAS/OnDiskCAS.cpp new file mode 100644 index 0000000000000..7d29f4499211e --- /dev/null +++ b/llvm/lib/CAS/OnDiskCAS.cpp @@ -0,0 +1,211 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "BuiltinCAS.h" +#include "llvm/CAS/BuiltinCASContext.h" +#include "llvm/CAS/BuiltinObjectHasher.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Error.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::builtin; + +namespace { + +class OnDiskCAS : public BuiltinCAS { +public: + Expected<ObjectRef> storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) final; + + Expected<std::optional<ObjectHandle>> loadIfExists(ObjectRef Ref) final; + + CASID getID(ObjectRef Ref) const final; + + std::optional<ObjectRef> getReference(const CASID &ID) const final; + + Expected<bool> isMaterialized(ObjectRef Ref) const final; + + ArrayRef<char> getDataConst(ObjectHandle Node) const final; + + void print(raw_ostream &OS) const final; + Error validate(bool CheckHash) const final; + + static Expected<std::unique_ptr<OnDiskCAS>> open(StringRef Path); + + OnDiskCAS(std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) + : UnifiedDB(std::move(UniDB)), DB(&UnifiedDB->getGraphDB()) {} + +private: + ObjectHandle convertHandle(ondisk::ObjectHandle Node) const { + return makeObjectHandle(Node.getOpaqueData()); + } + + ondisk::ObjectHandle convertHandle(ObjectHandle Node) const { + return ondisk::ObjectHandle(Node.getInternalRef(*this)); + } + + ObjectRef convertRef(ondisk::ObjectID Ref) const { + return makeObjectRef(Ref.getOpaqueData()); + } + + ondisk::ObjectID convertRef(ObjectRef Ref) const { + return ondisk::ObjectID::fromOpaqueData(Ref.getInternalRef(*this)); + } + + size_t getNumRefs(ObjectHandle Node) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return std::distance(RefsRange.begin(), RefsRange.end()); + } + + ObjectRef readRef(ObjectHandle Node, size_t I) const final { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + return convertRef(RefsRange.begin()[I]); + } + + Error forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const final; + + Error setSizeLimit(std::optional<uint64_t> SizeLimit) final; + Expected<std::optional<uint64_t>> getStorageSize() const final; + Error pruneStorageData() final; + + OnDiskCAS(std::unique_ptr<ondisk::OnDiskGraphDB> GraphDB) + : OwnedDB(std::move(GraphDB)), DB(OwnedDB.get()) 
{} + + std::unique_ptr<ondisk::OnDiskGraphDB> OwnedDB; + std::shared_ptr<ondisk::UnifiedOnDiskCache> UnifiedDB; + ondisk::OnDiskGraphDB *DB; +}; + +} // end anonymous namespace + +void OnDiskCAS::print(raw_ostream &OS) const { DB->print(OS); } +Error OnDiskCAS::validate(bool CheckHash) const { + auto Hasher = [](ArrayRef<ArrayRef<uint8_t>> Refs, ArrayRef<char> Data, + SmallVectorImpl<uint8_t> &Result) { + auto Hash = BuiltinObjectHasher<llvm::cas::builtin::HasherT>::hashObject( + Refs, Data); + Result.assign(Hash.begin(), Hash.end()); + }; + + if (auto E = DB->validate(CheckHash, Hasher)) + return E; + + return Error::success(); +} + +CASID OnDiskCAS::getID(ObjectRef Ref) const { + ArrayRef<uint8_t> Hash = DB->getDigest(convertRef(Ref)); + return CASID::create(&getContext(), toStringRef(Hash)); +} + +std::optional<ObjectRef> OnDiskCAS::getReference(const CASID &ID) const { + std::optional<ondisk::ObjectID> ObjID = + DB->getExistingReference(ID.getHash()); + if (!ObjID) + return std::nullopt; + return convertRef(*ObjID); +} + +Expected<bool> OnDiskCAS::isMaterialized(ObjectRef ExternalRef) const { + return DB->isMaterialized(convertRef(ExternalRef)); +} + +ArrayRef<char> OnDiskCAS::getDataConst(ObjectHandle Node) const { + return DB->getObjectData(convertHandle(Node)); +} + +Expected<std::optional<ObjectHandle>> +OnDiskCAS::loadIfExists(ObjectRef ExternalRef) { + Expected<std::optional<ondisk::ObjectHandle>> ObjHnd = + DB->load(convertRef(ExternalRef)); + if (!ObjHnd) + return ObjHnd.takeError(); + if (!*ObjHnd) + return std::nullopt; + return convertHandle(**ObjHnd); +} + +Expected<ObjectRef> OnDiskCAS::storeImpl(ArrayRef<uint8_t> ComputedHash, + ArrayRef<ObjectRef> Refs, + ArrayRef<char> Data) { + SmallVector<ondisk::ObjectID, 64> IDs; + IDs.reserve(Refs.size()); + for (ObjectRef Ref : Refs) { + IDs.push_back(convertRef(Ref)); + } + + auto StoredID = DB->getReference(ComputedHash); + if (LLVM_UNLIKELY(!StoredID)) + return StoredID.takeError(); + if (Error E = DB->store(*StoredID, IDs, Data)) + return std::move(E); + return convertRef(*StoredID); +} + +Error OnDiskCAS::forEachRef(ObjectHandle Node, + function_ref<Error(ObjectRef)> Callback) const { + auto RefsRange = DB->getObjectRefs(convertHandle(Node)); + for (ondisk::ObjectID Ref : RefsRange) { + if (Error E = Callback(convertRef(Ref))) + return E; + } + return Error::success(); +} + +Error OnDiskCAS::setSizeLimit(std::optional<uint64_t> SizeLimit) { + UnifiedDB->setSizeLimit(SizeLimit); + return Error::success(); +} + +Expected<std::optional<uint64_t>> OnDiskCAS::getStorageSize() const { + return UnifiedDB->getStorageSize(); +} + +Error OnDiskCAS::pruneStorageData() { return UnifiedDB->collectGarbage(); } + +Expected<std::unique_ptr<OnDiskCAS>> OnDiskCAS::open(StringRef AbsPath) { + Expected<std::unique_ptr<ondisk::OnDiskGraphDB>> DB = + ondisk::OnDiskGraphDB::open(AbsPath, BuiltinCASContext::getHashName(), + sizeof(HashType)); + if (!DB) + return DB.takeError(); + return std::unique_ptr<OnDiskCAS>(new OnDiskCAS(std::move(*DB))); +} + +bool cas::isOnDiskCASEnabled() { +#if LLVM_ENABLE_ONDISK_CAS + return true; +#else + return false; +#endif +} + +Expected<std::unique_ptr<ObjectStore>> cas::createOnDiskCAS(const Twine &Path) { +#if LLVM_ENABLE_ONDISK_CAS + // FIXME: An absolute path isn't really good enough. Should open a directory + // and use openat() for files underneath. 
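+  // For now, normalize the caller-provided path to an absolute path before
+  // opening the store.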
+ SmallString<256> AbsPath; + Path.toVector(AbsPath); + sys::fs::make_absolute(AbsPath); + + return OnDiskCAS::open(AbsPath); +#else + return createStringError(inconvertibleErrorCode(), "OnDiskCAS is disabled"); +#endif /* LLVM_ENABLE_ONDISK_CAS */ +} + +std::unique_ptr<ObjectStore> +cas::builtin::createObjectStoreFromUnifiedOnDiskCache( + std::shared_ptr<ondisk::UnifiedOnDiskCache> UniDB) { + return std::make_unique<OnDiskCAS>(std::move(UniDB)); +} diff --git a/llvm/lib/CAS/OnDiskGraphDB.cpp b/llvm/lib/CAS/OnDiskGraphDB.cpp index 64cbe9dc8e159..245b6fb832549 100644 --- a/llvm/lib/CAS/OnDiskGraphDB.cpp +++ b/llvm/lib/CAS/OnDiskGraphDB.cpp @@ -893,6 +893,10 @@ int64_t DataRecordHandle::getDataRelOffset() const { } Error OnDiskGraphDB::validate(bool Deep, HashingFuncT Hasher) const { + if (UpstreamDB) { + if (auto E = UpstreamDB->validate(Deep, Hasher)) + return E; + } return Index.validate([&](FileOffset Offset, OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error { @@ -1202,11 +1206,8 @@ OnDiskGraphDB::load(ObjectID ExternalRef) { return I.takeError(); TrieRecord::Data Object = I->Ref.load(); - if (Object.SK == TrieRecord::StorageKind::Unknown) { - if (!UpstreamDB) - return std::nullopt; + if (Object.SK == TrieRecord::StorageKind::Unknown) return faultInFromUpstream(ExternalRef); - } if (Object.SK == TrieRecord::StorageKind::DataPool) return ObjectHandle::fromFileOffset(Object.Offset); @@ -1286,8 +1287,10 @@ OnDiskGraphDB::getObjectPresence(ObjectID ExternalRef, TrieRecord::Data Object = I->Ref.load(); if (Object.SK != TrieRecord::StorageKind::Unknown) return ObjectPresence::InPrimaryDB; + if (!CheckUpstream || !UpstreamDB) return ObjectPresence::Missing; + std::optional<ObjectID> UpstreamID = UpstreamDB->getExistingReference(getDigest(*I)); return UpstreamID.has_value() ? ObjectPresence::OnlyInUpstreamDB @@ -1549,9 +1552,10 @@ unsigned OnDiskGraphDB::getHardStorageLimitUtilization() const { return std::max(IndexPercent, DataPercent); } -Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open( - StringRef AbsPath, StringRef HashName, unsigned HashByteSize, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, FaultInPolicy Policy) { +Expected<std::unique_ptr<OnDiskGraphDB>> +OnDiskGraphDB::open(StringRef AbsPath, StringRef HashName, + unsigned HashByteSize, OnDiskGraphDB *UpstreamDB, + FaultInPolicy Policy) { if (std::error_code EC = sys::fs::create_directories(AbsPath)) return createFileError(AbsPath, EC); @@ -1604,18 +1608,15 @@ Expected<std::unique_ptr<OnDiskGraphDB>> OnDiskGraphDB::open( "unexpected user header in '" + DataPoolPath + "'"); - return std::unique_ptr<OnDiskGraphDB>( - new OnDiskGraphDB(AbsPath, std::move(*Index), std::move(*DataPool), - std::move(UpstreamDB), Policy)); + return std::unique_ptr<OnDiskGraphDB>(new OnDiskGraphDB( + AbsPath, std::move(*Index), std::move(*DataPool), UpstreamDB, Policy)); } OnDiskGraphDB::OnDiskGraphDB(StringRef RootPath, OnDiskTrieRawHashMap Index, OnDiskDataAllocator DataPool, - std::unique_ptr<OnDiskGraphDB> UpstreamDB, - FaultInPolicy Policy) + OnDiskGraphDB *UpstreamDB, FaultInPolicy Policy) : Index(std::move(Index)), DataPool(std::move(DataPool)), - RootPath(RootPath.str()), UpstreamDB(std::move(UpstreamDB)), - FIPolicy(Policy) { + RootPath(RootPath.str()), UpstreamDB(UpstreamDB), FIPolicy(Policy) { /// Lifetime for "big" objects not in DataPool. /// /// NOTE: Could use ThreadSafeTrieRawHashMap here. 
For now, doing something @@ -1638,7 +1639,6 @@ Error OnDiskGraphDB::importFullTree(ObjectID PrimaryID, // against the process dying during importing and leaving the database with an // incomplete tree. Note that if the upstream has missing nodes then the tree // will be copied with missing nodes as well, it won't be considered an error. - struct UpstreamCursor { ObjectHandle Node; size_t RefsCount; @@ -1720,7 +1720,6 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID, // Copy the node data into the primary store. // FIXME: Use hard-link or cloning if the file-system supports it and data is // stored into a separate file. - auto Data = UpstreamDB->getObjectData(UpstreamNode); auto UpstreamRefs = UpstreamDB->getObjectRefs(UpstreamNode); SmallVector<ObjectID, 64> Refs; @@ -1737,7 +1736,8 @@ Error OnDiskGraphDB::importSingleNode(ObjectID PrimaryID, Expected<std::optional<ObjectHandle>> OnDiskGraphDB::faultInFromUpstream(ObjectID PrimaryID) { - assert(UpstreamDB); + if (!UpstreamDB) + return std::nullopt; auto UpstreamID = UpstreamDB->getReference(getDigest(PrimaryID)); if (LLVM_UNLIKELY(!UpstreamID)) diff --git a/llvm/lib/CAS/OnDiskKeyValueDB.cpp b/llvm/lib/CAS/OnDiskKeyValueDB.cpp index 21860717da3bf..15656cb38a5e5 100644 --- a/llvm/lib/CAS/OnDiskKeyValueDB.cpp +++ b/llvm/lib/CAS/OnDiskKeyValueDB.cpp @@ -20,6 +20,7 @@ #include "llvm/CAS/OnDiskKeyValueDB.h" #include "OnDiskCommon.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/Alignment.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Errc.h" @@ -53,15 +54,21 @@ Expected<std::optional<ArrayRef<char>>> OnDiskKeyValueDB::get(ArrayRef<uint8_t> Key) { // Check the result cache. OnDiskTrieRawHashMap::ConstOnDiskPtr ActionP = Cache.find(Key); - if (!ActionP) + if (ActionP) { + assert(isAddrAligned(Align(8), ActionP->Data.data())); + return ActionP->Data; + } + if (!UnifiedCache || !UnifiedCache->UpstreamKVDB) return std::nullopt; - assert(isAddrAligned(Align(8), ActionP->Data.data())); - return ActionP->Data; + + // Try to fault in from upstream. + return UnifiedCache->faultInFromUpstreamKV(Key); } Expected<std::unique_ptr<OnDiskKeyValueDB>> OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize, - StringRef ValueName, size_t ValueSize) { + StringRef ValueName, size_t ValueSize, + UnifiedOnDiskCache *Cache) { if (std::error_code EC = sys::fs::create_directories(Path)) return createFileError(Path, EC); @@ -87,10 +94,14 @@ OnDiskKeyValueDB::open(StringRef Path, StringRef HashName, unsigned KeySize, return std::move(E); return std::unique_ptr<OnDiskKeyValueDB>( - new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache))); + new OnDiskKeyValueDB(ValueSize, std::move(*ActionCache), Cache)); } Error OnDiskKeyValueDB::validate(CheckValueT CheckValue) const { + if (UnifiedCache && UnifiedCache->UpstreamKVDB) { + if (auto E = UnifiedCache->UpstreamKVDB->validate(CheckValue)) + return E; + } return Cache.validate( [&](FileOffset Offset, OnDiskTrieRawHashMap::ConstValueProxy Record) -> Error { diff --git a/llvm/lib/CAS/UnifiedOnDiskCache.cpp b/llvm/lib/CAS/UnifiedOnDiskCache.cpp new file mode 100644 index 0000000000000..ae9d818241f4b --- /dev/null +++ b/llvm/lib/CAS/UnifiedOnDiskCache.cpp @@ -0,0 +1,613 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Encapsulates \p OnDiskGraphDB and \p OnDiskKeyValueDB instances within one +/// directory while also restricting storage growth with a scheme of chaining +/// the two most recent directories (primary & upstream), where the primary +/// "faults-in" data from the upstream one. When the primary (most recent) +/// directory exceeds its intended limit, a new empty directory becomes the +/// primary one. +/// +/// Within the top-level directory (the path that \p UnifiedOnDiskCache::open +/// receives) there are directories named like this: +/// +/// 'v<version>.<x>' +/// 'v<version>.<x+1>' +/// 'v<version>.<x+2>' +/// ... +/// +/// 'version' is the version integer for this \p UnifiedOnDiskCache's scheme, and +/// the part after the dot is an increasing integer. The primary directory is +/// the one with the highest integer and the upstream one is the directory +/// before it. For example, if the sub-directories contained are: +/// +/// 'v1.5', 'v1.6', 'v1.7', 'v1.8' +/// +/// Then the primary one is 'v1.8', the upstream one is 'v1.7', and the rest are +/// unused directories that can be safely deleted at any time and by any +/// process. +/// +/// Contained within the top-level directory is a file named "lock", which is +/// used for processes to take shared or exclusive locks for the contents of the +/// top directory. While a \p UnifiedOnDiskCache is open, it keeps a shared lock +/// for the top-level directory; when it closes, if the primary sub-directory +/// exceeded its limit, it attempts to get an exclusive lock in order to create +/// a new empty primary directory; if it can't get the exclusive lock, it gives +/// up and leaves it to the next \p UnifiedOnDiskCache instance that closes to +/// try again. +/// +/// The downside of this scheme is that while \p UnifiedOnDiskCache is open on a +/// directory, by any process, the storage size in that directory will keep +/// growing unrestricted. But the major benefit is that garbage-collection can +/// be triggered on a directory concurrently, at any time and by any process, +/// without affecting any active readers/writers in the same process or other +/// processes. +/// +/// The \c UnifiedOnDiskCache also provides validation and recovery on top of +/// the underlying on-disk storage. The low-level storage is designed to remain +/// coherent across regular process crashes, but may be invalid after power loss +/// or similar system failures. \c UnifiedOnDiskCache::validateIfNeeded allows +/// validating the contents once per boot and can recover by marking invalid +/// data for garbage collection. +/// +/// The data recovery described above requires exclusive access to the CAS, and +/// it is an error to attempt recovery if the CAS is open in any process/thread. +/// In order to maximize backwards compatibility with tools that do not perform +/// validation before opening the CAS, we do not attempt to get exclusive access +/// until recovery is actually performed, meaning as long as the data is valid +/// it will not conflict with concurrent use.
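+///
+/// As a rough illustration only (the directory numbers here are hypothetical;
+/// actual contents depend on prior use), a top-level directory managed by this
+/// scheme might look like:
+///
+///   <root>/
+///     lock            - shared/exclusive lock taken while instances are open
+///     v1.validation   - records the boot time of the last validation run
+///     v1.7/           - upstream directory (the primary faults data in from it)
+///     v1.8/           - primary directory (receives all new data)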
+// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "BuiltinCAS.h" +#include "OnDiskCommon.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Errc.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/raw_ostream.h" +#include <optional> + +#if __has_include(<sys/sysctl.h>) +#include <sys/sysctl.h> +#endif + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; + +/// FIXME: When the version of \p DBDirPrefix is bumped up we need to figure out +/// how to handle the leftover sub-directories of the previous version, within +/// the \p UnifiedOnDiskCache::collectGarbage function. +static constexpr StringLiteral DBDirPrefix = "v1."; + +static constexpr StringLiteral ValidationFilename = "v1.validation"; +static constexpr StringLiteral CorruptPrefix = "corrupt."; + +ObjectID UnifiedOnDiskCache::getObjectIDFromValue(ArrayRef<char> Value) { + // little endian encoded. + assert(Value.size() == sizeof(uint64_t)); + return ObjectID::fromOpaqueData(support::endian::read64le(Value.data())); +} + +UnifiedOnDiskCache::ValueBytes +UnifiedOnDiskCache::getValueFromObjectID(ObjectID ID) { + // little endian encoded. + UnifiedOnDiskCache::ValueBytes ValBytes; + static_assert(ValBytes.size() == sizeof(ID.getOpaqueData())); + support::endian::write64le(ValBytes.data(), ID.getOpaqueData()); + return ValBytes; +} + +Expected<std::optional<ArrayRef<char>>> +UnifiedOnDiskCache::faultInFromUpstreamKV(ArrayRef<uint8_t> Key) { + assert(UpstreamGraphDB); + assert(UpstreamKVDB); + + std::optional<ArrayRef<char>> UpstreamValue; + if (Error E = UpstreamKVDB->get(Key).moveInto(UpstreamValue)) + return std::move(E); + if (!UpstreamValue) + return std::nullopt; + + // The value is the \p ObjectID in the context of the upstream + // \p OnDiskGraphDB instance. Translate it to the context of the primary + // \p OnDiskGraphDB instance. + ObjectID UpstreamID = getObjectIDFromValue(*UpstreamValue); + auto PrimaryID = + PrimaryGraphDB->getReference(UpstreamGraphDB->getDigest(UpstreamID)); + if (LLVM_UNLIKELY(!PrimaryID)) + return PrimaryID.takeError(); + return PrimaryKVDB->put(Key, getValueFromObjectID(*PrimaryID)); +} + +/// \returns all the 'v<version>.<x>' names of sub-directories, sorted with +/// ascending order of the integer after the dot. Corrupt directories, if +/// included, will come first. 
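+/// For example (hypothetical directory contents), {'v1.7', 'corrupt.0.v1.5',
+/// 'v1.6'} with \p IncludeCorrupt set yields {'corrupt.0.v1.5', 'v1.6',
+/// 'v1.7'}.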
+static Expected<SmallVector<std::string, 4>> +getAllDBDirs(StringRef Path, bool IncludeCorrupt = false) { + struct DBDir { + uint64_t Order; + std::string Name; + }; + SmallVector<DBDir> FoundDBDirs; + + std::error_code EC; + for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE; + DirI.increment(EC)) { + if (DirI->type() != sys::fs::file_type::directory_file) + continue; + StringRef SubDir = sys::path::filename(DirI->path()); + if (IncludeCorrupt && SubDir.starts_with(CorruptPrefix)) { + FoundDBDirs.push_back({0, std::string(SubDir)}); + continue; + } + if (!SubDir.starts_with(DBDirPrefix)) + continue; + uint64_t Order; + if (SubDir.substr(DBDirPrefix.size()).getAsInteger(10, Order)) + return createStringError(inconvertibleErrorCode(), + "unexpected directory " + DirI->path()); + FoundDBDirs.push_back({Order, std::string(SubDir)}); + } + if (EC) + return createFileError(Path, EC); + + llvm::sort(FoundDBDirs, [](const DBDir &LHS, const DBDir &RHS) -> bool { + return LHS.Order <= RHS.Order; + }); + + SmallVector<std::string, 4> DBDirs; + for (DBDir &Dir : FoundDBDirs) + DBDirs.push_back(std::move(Dir.Name)); + return DBDirs; +} + +static Expected<SmallVector<std::string, 4>> getAllGarbageDirs(StringRef Path) { + auto DBDirs = getAllDBDirs(Path, /*IncludeCorrupt=*/true); + if (!DBDirs) + return DBDirs.takeError(); + + // FIXME: When the version of \p DBDirPrefix is bumped up we need to figure + // out how to handle the leftover sub-directories of the previous version. + + for (unsigned Keep = 2; Keep > 0 && !DBDirs->empty(); --Keep) { + StringRef Back(DBDirs->back()); + if (Back.starts_with(CorruptPrefix)) + break; + DBDirs->pop_back(); + } + return *DBDirs; +} + +/// \returns Given a sub-directory named 'v<version>.<x>', it outputs the +/// 'v<version>.<x+1>' name. 
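+/// For example, given 'v1.8' this writes 'v1.9' to \p OS.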
+static void getNextDBDirName(StringRef DBDir, llvm::raw_ostream &OS) { + assert(DBDir.starts_with(DBDirPrefix)); + uint64_t Count; + bool Failed = DBDir.substr(DBDirPrefix.size()).getAsInteger(10, Count); + assert(!Failed); + (void)Failed; + OS << DBDirPrefix << Count + 1; +} + +static Error validateOutOfProcess(StringRef LLVMCasBinary, StringRef RootPath, + bool CheckHash) { + SmallVector<StringRef> Args{LLVMCasBinary, "-cas", RootPath, "-validate"}; + if (CheckHash) + Args.push_back("-check-hash"); + + llvm::SmallString<128> StdErrPath; + int StdErrFD = -1; + if (std::error_code EC = sys::fs::createTemporaryFile( + "llvm-cas-validate-stderr", "txt", StdErrFD, StdErrPath, + llvm::sys::fs::OF_Text)) + return createStringError(EC, "failed to create temporary file"); + FileRemover OutputRemover(StdErrPath.c_str()); + + std::optional<llvm::StringRef> Redirects[] = { + {""}, // stdin = /dev/null + {""}, // stdout = /dev/null + StdErrPath.str(), + }; + + std::string ErrMsg; + int Result = + sys::ExecuteAndWait(LLVMCasBinary, Args, /*Env=*/std::nullopt, Redirects, + /*SecondsToWait=*/120, /*MemoryLimit=*/0, &ErrMsg); + + if (Result == -1) + return createStringError("failed to exec " + join(Args, " ") + ": " + + ErrMsg); + if (Result != 0) { + llvm::SmallString<64> Err("cas contents invalid"); + if (!ErrMsg.empty()) { + Err += ": "; + Err += ErrMsg; + } + auto StdErrBuf = MemoryBuffer::getFile(StdErrPath.c_str()); + if (StdErrBuf && !(*StdErrBuf)->getBuffer().empty()) { + Err += ": "; + Err += (*StdErrBuf)->getBuffer(); + } + return createStringError(Err); + } + return Error::success(); +} + +static Error validateInProcess(StringRef RootPath, StringRef HashName, + unsigned HashByteSize, bool CheckHash) { + std::shared_ptr<UnifiedOnDiskCache> UniDB; + if (Error E = UnifiedOnDiskCache::open(RootPath, std::nullopt, HashName, + HashByteSize) + .moveInto(UniDB)) + return E; + auto CAS = builtin::createObjectStoreFromUnifiedOnDiskCache(UniDB); + if (Error E = CAS->validate(CheckHash)) + return E; + auto Cache = builtin::createActionCacheFromUnifiedOnDiskCache(UniDB); + if (Error E = Cache->validate()) + return E; + return Error::success(); +} + +static Expected<uint64_t> getBootTime() { +#if __has_include(<sys/sysctl.h>) && defined(KERN_BOOTTIME) + struct timeval TV; + size_t TVLen = sizeof(TV); + int KernBoot[2] = {CTL_KERN, KERN_BOOTTIME}; + if (sysctl(KernBoot, 2, &TV, &TVLen, nullptr, 0) < 0) + return createStringError(llvm::errnoAsErrorCode(), + "failed to get boottime"); + if (TVLen != sizeof(TV)) + return createStringError("sysctl kern.boottime unexpected format"); + return TV.tv_sec; +#elif defined(__linux__) + // Use the mtime for /proc, which is recreated during system boot. + // We could also read /proc/stat and search for 'btime'. 
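On Linux the code above approximates the boot time from the mtime of /proc, which the kernel recreates at boot. A tiny illustration of the same idea with std::filesystem; note that before C++20 the epoch of the filesystem clock is unspecified, so only equality between runs on the same boot is meaningful, which is all the validation gate below needs:

#include <filesystem>
#include <iostream>
#include <system_error>

int main() {
  std::error_code EC;
  auto MTime = std::filesystem::last_write_time("/proc", EC);
  if (EC) {
    std::cerr << "cannot stat /proc: " << EC.message() << "\n";
    return 1;
  }
  // The raw tick count only needs to be stable for the lifetime of a boot.
  std::cout << MTime.time_since_epoch().count() << "\n";
  return 0;
}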
+ sys::fs::file_status Status; + if (std::error_code EC = sys::fs::status("/proc", Status)) + return createFileError("/proc", EC); + return Status.getLastModificationTime().time_since_epoch().count(); +#else + llvm::report_fatal_error("getBootTime unimplemented"); +#endif +} + +Expected<ValidationResult> UnifiedOnDiskCache::validateIfNeeded( + StringRef RootPath, StringRef HashName, unsigned HashByteSize, + bool CheckHash, bool AllowRecovery, bool ForceValidation, + std::optional<StringRef> LLVMCasBinaryPath) { + if (std::error_code EC = sys::fs::create_directories(RootPath)) + return createFileError(RootPath, EC); + + SmallString<256> PathBuf(RootPath); + sys::path::append(PathBuf, ValidationFilename); + int FD = -1; + if (std::error_code EC = sys::fs::openFileForReadWrite( + PathBuf, FD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) + return createFileError(PathBuf, EC); + assert(FD != -1); + + sys::fs::file_t File = sys::fs::convertFDToNativeFile(FD); + auto CloseFile = make_scope_exit([&]() { sys::fs::closeFile(File); }); + + if (std::error_code EC = lockFileThreadSafe(FD, sys::fs::LockKind::Exclusive)) + return createFileError(PathBuf, EC); + auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(FD); }); + + SmallString<8> Bytes; + if (Error E = sys::fs::readNativeFileToEOF(File, Bytes)) + return createFileError(PathBuf, std::move(E)); + + uint64_t ValidationBootTime = 0; + if (!Bytes.empty() && + StringRef(Bytes).trim().getAsInteger(10, ValidationBootTime)) + return createFileError(PathBuf, errc::illegal_byte_sequence, + "expected integer"); + + static uint64_t BootTime = 0; + if (BootTime == 0) + if (Error E = getBootTime().moveInto(BootTime)) + return std::move(E); + + std::string LogValidationError; + + if (ValidationBootTime == BootTime && !ForceValidation) + return ValidationResult::Skipped; + + // Validate! + bool NeedsRecovery = false; + if (Error E = + LLVMCasBinaryPath + ? validateOutOfProcess(*LLVMCasBinaryPath, RootPath, CheckHash) + : validateInProcess(RootPath, HashName, HashByteSize, + CheckHash)) { + if (AllowRecovery) { + consumeError(std::move(E)); + NeedsRecovery = true; + } else { + return std::move(E); + } + } + + if (NeedsRecovery) { + sys::path::remove_filename(PathBuf); + sys::path::append(PathBuf, "lock"); + + int LockFD = -1; + if (std::error_code EC = sys::fs::openFileForReadWrite( + PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) + return createFileError(PathBuf, EC); + sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD); + auto CloseLock = make_scope_exit([&]() { sys::fs::closeFile(LockFile); }); + if (std::error_code EC = tryLockFileThreadSafe(LockFD)) { + if (EC == std::errc::no_lock_available) + return createFileError( + PathBuf, EC, + "CAS validation requires exclusive access but CAS was in use"); + return createFileError(PathBuf, EC); + } + auto UnlockFD = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); }); + + auto DBDirs = getAllDBDirs(RootPath); + if (!DBDirs) + return DBDirs.takeError(); + + for (StringRef DBDir : *DBDirs) { + sys::path::remove_filename(PathBuf); + sys::path::append(PathBuf, DBDir); + std::error_code EC; + int Attempt = 0, MaxAttempts = 100; + SmallString<128> GCPath; + for (; Attempt < MaxAttempts; ++Attempt) { + GCPath.assign(RootPath); + sys::path::append(GCPath, CorruptPrefix + std::to_string(Attempt) + + "." + DBDir); + EC = sys::fs::rename(PathBuf, GCPath); + // Darwin uses ENOTEMPTY. Linux may return either ENOTEMPTY or EEXIST. 
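The recovery path below parks an invalid database directory under a 'corrupt.<attempt>.' prefix, bumping the attempt number whenever the destination is already occupied (reported as ENOTEMPTY or EEXIST depending on the platform). A hedged std::filesystem sketch of that retry loop, with illustrative names rather than the patch's helpers:

#include <filesystem>
#include <string>
#include <system_error>

namespace fs = std::filesystem;

// Rename Root/Name to Root/corrupt.<N>.Name, probing N until a destination
// that is not already occupied is found.
static std::error_code moveAside(const fs::path &Root, const std::string &Name) {
  std::error_code EC;
  for (int Attempt = 0; Attempt < 100; ++Attempt) {
    fs::path Dest = Root / ("corrupt." + std::to_string(Attempt) + "." + Name);
    fs::rename(Root / Name, Dest, EC);
    if (EC != std::errc::directory_not_empty && EC != std::errc::file_exists)
      return EC; // success, or an error that retrying cannot fix
  }
  return std::make_error_code(std::errc::file_exists);
}

int main() {
  fs::path Root = fs::temp_directory_path() / "uodc-demo";
  fs::create_directories(Root / "v1.1");
  return moveAside(Root, "v1.1") ? 1 : 0;
}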
+ if (EC != errc::directory_not_empty && EC != errc::file_exists) + break; + } + if (Attempt == MaxAttempts) + return createStringError( + EC, "rename " + PathBuf + + " failed: too many CAS directories awaiting pruning"); + if (EC) + return createStringError(EC, "rename " + PathBuf + " to " + GCPath + + " failed: " + EC.message()); + } + } + + if (ValidationBootTime != BootTime) { + // Fix filename in case we have error to report. + sys::path::remove_filename(PathBuf); + sys::path::append(PathBuf, ValidationFilename); + if (std::error_code EC = sys::fs::resize_file(FD, 0)) + return createFileError(PathBuf, EC); + raw_fd_ostream OS(FD, /*shouldClose=*/false); + OS.seek(0); // resize does not reset position + OS << BootTime << '\n'; + if (OS.has_error()) + return createFileError(PathBuf, OS.error()); + } + + return NeedsRecovery ? ValidationResult::Recovered : ValidationResult::Valid; +} + +Expected<std::unique_ptr<UnifiedOnDiskCache>> +UnifiedOnDiskCache::open(StringRef RootPath, std::optional<uint64_t> SizeLimit, + StringRef HashName, unsigned HashByteSize, + OnDiskGraphDB::FaultInPolicy FaultInPolicy) { + if (std::error_code EC = sys::fs::create_directories(RootPath)) + return createFileError(RootPath, EC); + + SmallString<256> PathBuf(RootPath); + sys::path::append(PathBuf, "lock"); + int LockFD = -1; + if (std::error_code EC = sys::fs::openFileForReadWrite( + PathBuf, LockFD, sys::fs::CD_OpenAlways, sys::fs::OF_None)) + return createFileError(PathBuf, EC); + assert(LockFD != -1); + // Locking the directory using shared lock, which will prevent other processes + // from creating a new chain (essentially while a \p UnifiedOnDiskCache + // instance holds a shared lock the storage for the primary directory will + // grow unrestricted). + if (std::error_code EC = + lockFileThreadSafe(LockFD, sys::fs::LockKind::Shared)) + return createFileError(PathBuf, EC); + + auto DBDirs = getAllDBDirs(RootPath); + if (!DBDirs) + return DBDirs.takeError(); + if (DBDirs->empty()) + DBDirs->push_back((Twine(DBDirPrefix) + "1").str()); + + assert(!DBDirs->empty()); + + /// If there is only one directory open databases on it. If there are 2 or + /// more directories, get the most recent directories and chain them, with the + /// most recent being the primary one. The remaining directories are unused + /// data than can be garbage-collected. + auto UniDB = std::unique_ptr<UnifiedOnDiskCache>(new UnifiedOnDiskCache()); + std::unique_ptr<OnDiskGraphDB> UpstreamGraphDB; + std::unique_ptr<OnDiskKeyValueDB> UpstreamKVDB; + if (DBDirs->size() > 1) { + StringRef UpstreamDir = *(DBDirs->end() - 2); + PathBuf = RootPath; + sys::path::append(PathBuf, UpstreamDir); + if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize, + /*UpstreamDB=*/nullptr, FaultInPolicy) + .moveInto(UpstreamGraphDB)) + return std::move(E); + if (Error E = OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize, + /*ValueName=*/"objectid", + /*ValueSize=*/sizeof(uint64_t)) + .moveInto(UpstreamKVDB)) + return std::move(E); + } + + StringRef PrimaryDir = *(DBDirs->end() - 1); + PathBuf = RootPath; + sys::path::append(PathBuf, PrimaryDir); + std::unique_ptr<OnDiskGraphDB> PrimaryGraphDB; + if (Error E = OnDiskGraphDB::open(PathBuf, HashName, HashByteSize, + UpstreamGraphDB.get(), FaultInPolicy) + .moveInto(PrimaryGraphDB)) + return std::move(E); + std::unique_ptr<OnDiskKeyValueDB> PrimaryKVDB; + // \p UnifiedOnDiskCache does manual chaining for key-value requests, + // including an extra translation step of the value during fault-in. 
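To make the manual chaining concrete: a lookup goes to the primary store first and, on a miss, consults the upstream store and copies the value forward so later lookups hit the primary directly. A toy sketch with std::map standing in for the on-disk key-value databases (purely illustrative; the real fault-in additionally re-maps the upstream ObjectID into the primary store's numbering):

#include <cassert>
#include <map>
#include <optional>
#include <string>

using KVStore = std::map<std::string, std::string>;

// Look up Key in Primary, faulting the value in from Upstream on a miss.
static std::optional<std::string> chainedGet(KVStore &Primary,
                                             const KVStore &Upstream,
                                             const std::string &Key) {
  if (auto It = Primary.find(Key); It != Primary.end())
    return It->second;
  auto It = Upstream.find(Key);
  if (It == Upstream.end())
    return std::nullopt;
  // In the real cache the upstream value would be translated before being
  // recorded in the primary chain.
  Primary[Key] = It->second;
  return It->second;
}

int main() {
  KVStore Primary, Upstream{{"k", "v"}};
  assert(chainedGet(Primary, Upstream, "k") == "v");
  assert(Primary.count("k") == 1); // now cached in the primary chain
  assert(!chainedGet(Primary, Upstream, "missing"));
  return 0;
}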
+ if (Error E = + OnDiskKeyValueDB::open(PathBuf, HashName, HashByteSize, + /*ValueName=*/"objectid", + /*ValueSize=*/sizeof(uint64_t), UniDB.get()) + .moveInto(PrimaryKVDB)) + return std::move(E); + + UniDB->RootPath = RootPath; + UniDB->SizeLimit = SizeLimit.value_or(0); + UniDB->LockFD = LockFD; + UniDB->NeedsGarbageCollection = DBDirs->size() > 2; + UniDB->PrimaryDBDir = PrimaryDir; + UniDB->UpstreamGraphDB = std::move(UpstreamGraphDB); + UniDB->PrimaryGraphDB = std::move(PrimaryGraphDB); + UniDB->UpstreamKVDB = std::move(UpstreamKVDB); + UniDB->PrimaryKVDB = std::move(PrimaryKVDB); + + return std::move(UniDB); +} + +void UnifiedOnDiskCache::setSizeLimit(std::optional<uint64_t> SizeLimit) { + this->SizeLimit = SizeLimit.value_or(0); +} + +uint64_t UnifiedOnDiskCache::getStorageSize() const { + uint64_t TotalSize = getPrimaryStorageSize(); + if (UpstreamGraphDB) + TotalSize += UpstreamGraphDB->getStorageSize(); + if (UpstreamKVDB) + TotalSize += UpstreamKVDB->getStorageSize(); + return TotalSize; +} + +uint64_t UnifiedOnDiskCache::getPrimaryStorageSize() const { + return PrimaryGraphDB->getStorageSize() + PrimaryKVDB->getStorageSize(); +} + +bool UnifiedOnDiskCache::hasExceededSizeLimit() const { + uint64_t CurSizeLimit = SizeLimit; + if (!CurSizeLimit) + return false; + + // If the hard limit is beyond 85%, declare above limit and request clean up. + unsigned CurrentPercent = + std::max(PrimaryGraphDB->getHardStorageLimitUtilization(), + PrimaryKVDB->getHardStorageLimitUtilization()); + if (CurrentPercent > 85) + return true; + + // We allow each of the directories in the chain to reach up to half the + // intended size limit. Check whether the primary directory has exceeded half + // the limit or not, in order to decide whether we need to start a new chain. + // + // We could check the size limit against the sum of sizes of both the primary + // and upstream directories but then if the upstream is significantly larger + // than the intended limit, it would trigger a new chain to be created before + // the primary has reached its own limit. Essentially in such situation we + // prefer reclaiming the storage later in order to have more consistent cache + // hits behavior. + return (CurSizeLimit / 2) < getPrimaryStorageSize(); +} + +Error UnifiedOnDiskCache::close(bool CheckSizeLimit) { + if (LockFD == -1) + return Error::success(); // already closed. + auto CloseLock = make_scope_exit([&]() { + assert(LockFD >= 0); + sys::fs::file_t LockFile = sys::fs::convertFDToNativeFile(LockFD); + sys::fs::closeFile(LockFile); + LockFD = -1; + }); + + bool ExceededSizeLimit = CheckSizeLimit ? hasExceededSizeLimit() : false; + UpstreamKVDB.reset(); + PrimaryKVDB.reset(); + UpstreamGraphDB.reset(); + PrimaryGraphDB.reset(); + if (std::error_code EC = unlockFileThreadSafe(LockFD)) + return createFileError(RootPath, EC); + + if (!ExceededSizeLimit) + return Error::success(); + + // The primary directory exceeded its intended size limit. Try to get an + // exclusive lock in order to create a new primary directory for next time + // this \p UnifiedOnDiskCache path is opened. + + if (std::error_code EC = tryLockFileThreadSafe( + LockFD, std::chrono::milliseconds(0), sys::fs::LockKind::Exclusive)) { + if (EC == errc::no_lock_available) + return Error::success(); // couldn't get exclusive lock, give up. 
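The growth policy in hasExceededSizeLimit above starts a new chain when either database is close to its hard map limit or the primary directory alone has consumed more than half of the configured size limit. The same decision reduced to plain integers, as a sketch using the 85% threshold stated in the code above:

#include <algorithm>
#include <cassert>
#include <cstdint>

static bool exceededSizeLimit(uint64_t SizeLimit, unsigned GraphUtilPercent,
                              unsigned KVUtilPercent, uint64_t PrimarySize) {
  if (!SizeLimit)
    return false; // no limit configured
  if (std::max(GraphUtilPercent, KVUtilPercent) > 85)
    return true; // close to the hard map limit, start a new chain
  return (SizeLimit / 2) < PrimarySize; // primary may use half the budget
}

int main() {
  assert(!exceededSizeLimit(0, 99, 99, 1000));  // limit disabled
  assert(exceededSizeLimit(1000, 90, 10, 100)); // hard-limit pressure
  assert(exceededSizeLimit(1000, 10, 10, 600)); // over half the budget
  assert(!exceededSizeLimit(1000, 10, 10, 400));
  return 0;
}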
+ return createFileError(RootPath, EC); + } + auto UnlockFile = make_scope_exit([&]() { unlockFileThreadSafe(LockFD); }); + + // Managed to get an exclusive lock which means there are no other open + // \p UnifiedOnDiskCache instances for the same path, so we can safely start a + // new primary directory. To start a new primary directory we just have to + // create a new empty directory with the next consecutive index; since this is + // an atomic operation we will leave the top-level directory in a consistent + // state even if the process dies during this code-path. + + SmallString<256> PathBuf(RootPath); + raw_svector_ostream OS(PathBuf); + OS << sys::path::get_separator(); + getNextDBDirName(PrimaryDBDir, OS); + if (std::error_code EC = sys::fs::create_directory(PathBuf)) + return createFileError(PathBuf, EC); + + NeedsGarbageCollection = true; + return Error::success(); +} + +UnifiedOnDiskCache::UnifiedOnDiskCache() = default; + +UnifiedOnDiskCache::~UnifiedOnDiskCache() { consumeError(close()); } + +Error UnifiedOnDiskCache::collectGarbage(StringRef Path) { + auto DBDirs = getAllGarbageDirs(Path); + if (!DBDirs) + return DBDirs.takeError(); + + SmallString<256> PathBuf(Path); + for (StringRef UnusedSubDir : *DBDirs) { + sys::path::append(PathBuf, UnusedSubDir); + if (std::error_code EC = sys::fs::remove_directories(PathBuf)) + return createFileError(PathBuf, EC); + sys::path::remove_filename(PathBuf); + } + return Error::success(); +} + +Error UnifiedOnDiskCache::collectGarbage() { return collectGarbage(RootPath); } diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp index 53f1cfe24a68d..6412949948c07 100644 --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -38,6 +38,7 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/MemoryModelRelaxationAnnotations.h" #include "llvm/IR/Module.h" +#include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -1259,8 +1260,7 @@ Value *AtomicExpandImpl::insertRMWLLSCLoop( BasicBlock *BB = Builder.GetInsertBlock(); Function *F = BB->getParent(); - assert(AddrAlign >= - F->getDataLayout().getTypeStoreSize(ResultTy) && + assert(AddrAlign >= F->getDataLayout().getTypeStoreSize(ResultTy) && "Expected at least natural alignment at this point."); // Given: atomicrmw some_op iN* %addr, iN %incr ordering @@ -1295,7 +1295,13 @@ Value *AtomicExpandImpl::insertRMWLLSCLoop( TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder); Value *TryAgain = Builder.CreateICmpNE( StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain"); - Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); + + Instruction *CondBr = Builder.CreateCondBr(TryAgain, LoopBB, ExitBB); + + // Atomic RMW expands to a Load-linked / Store-Conditional loop. Because it is + // hard to predict precise branch weights, we mark the branch as "unknown" + // (50/50) to prevent misleading optimizations.
+ setExplicitlyUnknownBranchWeightsIfProfiled(*CondBr, *F, DEBUG_TYPE); Builder.SetInsertPoint(ExitBB, ExitBB->begin()); return Loaded; diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 1fc90d0852aad..4fd220481cedb 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -294,6 +294,10 @@ void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) { MachinePreds[Edge].push_back(NewPred); } +static bool targetSupportsBF16Type(const MachineFunction *MF) { + return MF->getTarget().getTargetTriple().isSPIRV(); +} + static bool containsBF16Type(const User &U) { // BF16 cannot currently be represented by LLT, to avoid miscompiles we // prevent any instructions using them. FIXME: This can be removed once LLT @@ -306,7 +310,7 @@ static bool containsBF16Type(const User &U) { bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; // Get or create a virtual register for each value. @@ -328,7 +332,7 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, bool IRTranslator::translateUnaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; Register Op0 = getOrCreateVReg(*U.getOperand(0)); @@ -348,7 +352,7 @@ bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) { bool IRTranslator::translateCompare(const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; auto *CI = cast<CmpInst>(&U); @@ -1569,7 +1573,7 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; uint32_t Flags = 0; @@ -2688,7 +2692,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, bool IRTranslator::translateInlineAsm(const CallBase &CB, MachineIRBuilder &MIRBuilder) { - if (containsBF16Type(CB)) + if (containsBF16Type(CB) && !targetSupportsBF16Type(MF)) return false; const InlineAsmLowering *ALI = MF->getSubtarget().getInlineAsmLowering(); @@ -2779,7 +2783,7 @@ bool IRTranslator::translateCallBase(const CallBase &CB, } bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { - if (!MF->getTarget().getTargetTriple().isSPIRV() && containsBF16Type(U)) + if (containsBF16Type(U) && !targetSupportsBF16Type(MF)) return false; const CallInst &CI = cast<CallInst>(U); diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index 4b4df98024f4a..637acd61c8a5f 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -109,8 +109,10 @@ MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C, if (auto *CI = dyn_cast<ConstantInt>(NumericConstant)) { if (CI->getBitWidth() > 64) MIB.addCImm(CI); - else + else if (CI->getBitWidth() == 1) MIB.addImm(CI->getZExtValue()); + else + MIB.addImm(CI->getSExtValue()); } else if (auto *CFP = dyn_cast<ConstantFP>(NumericConstant)) { MIB.addFPImm(CFP); } else if (isa<ConstantPointerNull>(NumericConstant)) { diff --git 
a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp index c31454a8affda..b5d3092ee84d8 100644 --- a/llvm/lib/CodeGen/MachineFunctionSplitter.cpp +++ b/llvm/lib/CodeGen/MachineFunctionSplitter.cpp @@ -129,6 +129,9 @@ static bool isColdBlock(const MachineBasicBlock &MBB, } bool MachineFunctionSplitter::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + // Do not split functions when -basic-block-sections=all is specified. if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All) return false; diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp index da29ffc9d2fed..88d81993fbe55 100644 --- a/llvm/lib/CodeGen/MachineInstrBundle.cpp +++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp @@ -136,6 +136,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, SmallSetVector<Register, 8> ExternUses; SmallSet<Register, 8> KilledUseSet; SmallSet<Register, 8> UndefUseSet; + SmallVector<std::pair<Register, Register>> TiedOperands; for (auto MII = FirstMI; MII != LastMI; ++MII) { // Debug instructions have no effects to track. if (MII->isDebugInstr()) @@ -161,6 +162,15 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, // External def is now killed. KilledUseSet.insert(Reg); } + if (MO.isTied() && Reg.isVirtual()) { + // Record tied operand constraints that involve virtual registers so + // that bundles that are formed pre-register allocation reflect the + // relevant constraints. + unsigned TiedIdx = MII->findTiedOperandIdx(MO.getOperandNo()); + MachineOperand &TiedMO = MII->getOperand(TiedIdx); + Register DefReg = TiedMO.getReg(); + TiedOperands.emplace_back(DefReg, Reg); + } } } @@ -203,7 +213,17 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, bool isKill = KilledUseSet.contains(Reg); bool isUndef = UndefUseSet.contains(Reg); MIB.addReg(Reg, getKillRegState(isKill) | getUndefRegState(isUndef) | - getImplRegState(true)); + getImplRegState(true)); + } + + for (auto [DefReg, UseReg] : TiedOperands) { + unsigned DefIdx = + std::distance(LocalDefs.begin(), llvm::find(LocalDefs, DefReg)); + unsigned UseIdx = + std::distance(ExternUses.begin(), llvm::find(ExternUses, UseReg)); + assert(DefIdx < LocalDefs.size()); + assert(UseIdx < ExternUses.size()); + MIB->tieOperands(DefIdx, LocalDefs.size() + UseIdx); } } diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index f18c051142960..73993705c4a7b 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -2559,7 +2559,7 @@ init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) { for (unsigned i = 0; i < ResourceCount; ++i) { ReservedCyclesIndex[i] = NumUnits; NumUnits += SchedModel->getProcResource(i)->NumUnits; - if (isUnbufferedGroup(i)) { + if (isReservedGroup(i)) { auto SubUnits = SchedModel->getProcResource(i)->SubUnitsIdxBegin; for (unsigned U = 0, UE = SchedModel->getProcResource(i)->NumUnits; U != UE; ++U) @@ -2631,7 +2631,7 @@ SchedBoundary::getNextResourceCycle(const MCSchedClassDesc *SC, unsigned PIdx, assert(NumberOfInstances > 0 && "Cannot have zero instances of a ProcResource"); - if (isUnbufferedGroup(PIdx)) { + if (isReservedGroup(PIdx)) { // If any subunits are used by the instruction, report that the // subunits of the resource group are available at the first cycle // in which the unit is available, effectively removing the group diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp 
b/llvm/lib/CodeGen/MachineVerifier.cpp index c0710c467a2e6..fdf10480b6e05 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -2584,6 +2584,14 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { report("Extra explicit operand on non-variadic instruction", MO, MONum); } + // Verify earlyClobber def operand + if (MCID.getOperandConstraint(MONum, MCOI::EARLY_CLOBBER) != -1) { + if (!MO->isReg()) + report("Early clobber must be a register", MI); + if (!MO->isEarlyClobber()) + report("Missing earlyClobber flag", MI); + } + switch (MO->getType()) { case MachineOperand::MO_Register: { // Verify debug flag on debug instructions. Check this first because reg0 diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 38f6deb39ddf3..99f76936a180f 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -1600,6 +1600,22 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, SlotIndex DefIndex = CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber()); VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator(); + + // Refine the subranges that are now defined by the remat. + // This will split existing subranges if necessary. + DstInt.refineSubRanges( + Alloc, DstMask, + [&DefIndex, &Alloc](LiveInterval::SubRange &SR) { + // We know that this lane is defined by this instruction, + // but at this point it might not be live because it was not defined + // by the original instruction. This happens when the + // rematerialization widens the defined register. Assign that lane a + // dead def so that the interferences are properly modeled. + if (!SR.liveAt(DefIndex)) + SR.createDeadDef(DefIndex, Alloc); + }, + *LIS->getSlotIndexes(), *TRI); + for (LiveInterval::SubRange &SR : DstInt.subranges()) { if ((SR.LaneMask & DstMask).none()) { LLVM_DEBUG(dbgs() @@ -1617,14 +1633,6 @@ bool RegisterCoalescer::reMaterializeDef(const CoalescerPair &CP, // updateRegDefUses. The original subrange def may have only undefed // some lanes. UpdatedSubRanges = true; - } else { - // We know that this lane is defined by this instruction, - // but at this point it might not be live because it was not defined - // by the original instruction. This happens when the - // rematerialization widens the defined register. Assign that lane a - // dead def so that the interferences are properly modeled. 
- if (!SR.liveAt(DefIndex)) - SR.createDeadDef(DefIndex, Alloc); } } if (UpdatedSubRanges) diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index bb10cf687db8d..d84c3fb05bb24 100644 --- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -733,6 +733,8 @@ MachineOperand GetMOForConstDbgOp(const SDDbgOperand &Op) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) { if (CI->getBitWidth() > 64) return MachineOperand::CreateCImm(CI); + if (CI->getBitWidth() == 1) + return MachineOperand::CreateImm(CI->getZExtValue()); return MachineOperand::CreateImm(CI->getSExtValue()); } if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 431a81002074f..316aacdf6978e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -163,6 +163,8 @@ class SelectionDAGLegalize { RTLIB::Libcall CallI128); void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results); + SDValue ExpandSincosStretLibCall(SDNode *Node) const; + SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, const SDLoc &dl); SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, @@ -2423,6 +2425,101 @@ static bool useSinCos(SDNode *Node) { return false; } +SDValue SelectionDAGLegalize::ExpandSincosStretLibCall(SDNode *Node) const { + // For iOS, we want to call an alternative entry point: __sincos_stret, + // which returns the values in two S / D registers. + SDLoc dl(Node); + SDValue Arg = Node->getOperand(0); + EVT ArgVT = Arg.getValueType(); + RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); + RTLIB::LibcallImpl SincosStret = TLI.getLibcallImpl(LC); + if (SincosStret == RTLIB::Unsupported) + return SDValue(); + + /// There are 3 different ABI cases to handle: + /// - Direct return of separate fields in registers + /// - Single return as vector elements + /// - sret struct + + const RTLIB::RuntimeLibcallsInfo &CallsInfo = TLI.getRuntimeLibcallsInfo(); + + const DataLayout &DL = DAG.getDataLayout(); + + auto [FuncTy, FuncAttrs] = CallsInfo.getFunctionTy( + *DAG.getContext(), TM.getTargetTriple(), DL, SincosStret); + + Type *SincosStretRetTy = FuncTy->getReturnType(); + CallingConv::ID CallConv = CallsInfo.getLibcallImplCallingConv(SincosStret); + StringRef LibcallImplName = CallsInfo.getLibcallImplName(SincosStret); + + SDValue Callee = DAG.getExternalSymbol(LibcallImplName.data(), + TLI.getProgramPointerTy(DL)); + + TargetLowering::ArgListTy Args; + SDValue SRet; + + int FrameIdx; + if (FuncTy->getParamType(0)->isPointerTy()) { + // Uses sret + MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); + + AttributeSet PtrAttrs = FuncAttrs.getParamAttrs(0); + Type *StructTy = PtrAttrs.getStructRetType(); + const uint64_t ByteSize = DL.getTypeAllocSize(StructTy); + const Align StackAlign = DL.getPrefTypeAlign(StructTy); + + FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); + SRet = DAG.getFrameIndex(FrameIdx, TLI.getFrameIndexTy(DL)); + + TargetLowering::ArgListEntry Entry(SRet, FuncTy->getParamType(0)); + Entry.IsSRet = true; + Entry.IndirectType = StructTy; + Entry.Alignment = StackAlign; + + Args.push_back(Entry); + Args.emplace_back(Arg, FuncTy->getParamType(1)); + } else { + Args.emplace_back(Arg, FuncTy->getParamType(0)); + } + + TargetLowering::CallLoweringInfo CLI(DAG); + CLI.setDebugLoc(dl) + 
.setChain(DAG.getEntryNode()) + .setLibCallee(CallConv, SincosStretRetTy, Callee, std::move(Args)) + .setIsPostTypeLegalization(); + + std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); + + if (SRet) { + MachinePointerInfo PtrInfo = + MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx); + SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, PtrInfo); + + TypeSize StoreSize = ArgVT.getStoreSize(); + + // Address of cos field. + SDValue Add = DAG.getObjectPtrOffset(dl, SRet, StoreSize); + SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, + PtrInfo.getWithOffset(StoreSize)); + + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, LoadSin.getValue(0), + LoadCos.getValue(0)); + } + + if (!CallResult.first.getValueType().isVector()) + return CallResult.first; + + SDValue SinVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(0, dl)); + SDValue CosVal = + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, + DAG.getVectorIdxConstant(1, dl)); + SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); + return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); +} + SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { SDLoc dl(Node); EVT VT = Node->getValueType(0); @@ -4730,6 +4827,18 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { case ISD::FSINCOS: case ISD::FSINCOSPI: { EVT VT = Node->getValueType(0); + + if (Node->getOpcode() == ISD::FSINCOS) { + RTLIB::Libcall SincosStret = RTLIB::getSINCOS_STRET(VT); + if (SincosStret != RTLIB::UNKNOWN_LIBCALL) { + if (SDValue Expanded = ExpandSincosStretLibCall(Node)) { + Results.push_back(Expanded); + Results.push_back(Expanded.getValue(1)); + break; + } + } + } + RTLIB::Libcall LC = Node->getOpcode() == ISD::FSINCOS ? RTLIB::getSINCOS(VT) : RTLIB::getSINCOSPI(VT); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index a52265055c88a..fa0c899dfcc27 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8958,9 +8958,8 @@ bool SelectionDAGBuilder::canTailCall(const CallBase &CB) const { // Avoid emitting tail calls in functions with the disable-tail-calls // attribute. const Function *Caller = CB.getParent()->getParent(); - if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() == - "true" && - !isMustTailCall) + if (!isMustTailCall && + Caller->getFnAttribute("disable-tail-calls").getValueAsBool()) return false; // We can't tail call inside a function with a swifterror argument. 
Lowering diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp index 6c78ef05e1b61..7496c5a084da4 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -704,7 +704,9 @@ void DWARFDie::dump(raw_ostream &OS, unsigned Indent, DIDumpOptions ChildDumpOpts = DumpOpts; ChildDumpOpts.ShowParents = false; while (Child) { - Child.dump(OS, Indent + 2, ChildDumpOpts); + if (DumpOpts.FilterChildTag.empty() || + llvm::is_contained(DumpOpts.FilterChildTag, Child.getTag())) + Child.dump(OS, Indent + 2, ChildDumpOpts); Child = Child.getSibling(); } } diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 3b8fde8aff45f..cd39970f5111f 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -4171,6 +4171,16 @@ SwitchInstProfUpdateWrapper::removeCase(SwitchInst::CaseIt I) { return SI.removeCase(I); } +void SwitchInstProfUpdateWrapper::replaceDefaultDest(SwitchInst::CaseIt I) { + auto *DestBlock = I->getCaseSuccessor(); + if (Weights) { + auto Weight = getSuccessorWeight(I->getCaseIndex() + 1); + (*Weights)[0] = Weight.value(); + } + + SI.setDefaultDest(DestBlock); +} + void SwitchInstProfUpdateWrapper::addCase( ConstantInt *OnVal, BasicBlock *Dest, SwitchInstProfUpdateWrapper::CaseWeightOpt W) { diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 77af29b9d70f6..2ce5719228a0d 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -7,7 +7,9 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/RuntimeLibcalls.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/StringTable.h" +#include "llvm/IR/DataLayout.h" #include "llvm/Support/Debug.h" #include "llvm/Support/xxhash.h" #include "llvm/TargetParser/ARMTargetParser.h" @@ -72,3 +74,80 @@ bool RuntimeLibcallsInfo::darwinHasExp10(const Triple &TT) { return false; } } + +std::pair<FunctionType *, AttributeList> +RuntimeLibcallsInfo::getFunctionTy(LLVMContext &Ctx, const Triple &TT, + const DataLayout &DL, + RTLIB::LibcallImpl LibcallImpl) const { + static constexpr Attribute::AttrKind CommonFnAttrs[] = { + Attribute::NoCallback, Attribute::NoFree, Attribute::NoSync, + Attribute::NoUnwind, Attribute::WillReturn}; + + switch (LibcallImpl) { + case RTLIB::impl___sincos_stret: + case RTLIB::impl___sincosf_stret: { + if (!darwinHasSinCosStret(TT)) // Non-darwin currently unexpected + return {}; + + Type *ScalarTy = LibcallImpl == RTLIB::impl___sincosf_stret + ? Type::getFloatTy(Ctx) + : Type::getDoubleTy(Ctx); + + AttrBuilder FuncAttrBuilder(Ctx); + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + + const bool UseSret = + TT.isX86_32() || ((TT.isARM() || TT.isThumb()) && + ARM::computeTargetABI(TT) == ARM::ARM_ABI_APCS); + + FuncAttrBuilder.addMemoryAttr(MemoryEffects::argumentOrErrnoMemOnly( + UseSret ? 
ModRefInfo::Mod : ModRefInfo::NoModRef, ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + if (UseSret) { + AttrBuilder AttrBuilder(Ctx); + StructType *StructTy = StructType::get(ScalarTy, ScalarTy); + AttrBuilder.addStructRetAttr(StructTy); + AttrBuilder.addAlignmentAttr(DL.getABITypeAlign(StructTy)); + FunctionType *FuncTy = FunctionType::get( + Type::getVoidTy(Ctx), {DL.getAllocaPtrType(Ctx), ScalarTy}, false); + + return {FuncTy, Attrs.addParamAttributes(Ctx, 0, AttrBuilder)}; + } + + Type *RetTy = + LibcallImpl == RTLIB::impl___sincosf_stret && TT.isX86_64() + ? static_cast<Type *>(FixedVectorType::get(ScalarTy, 2)) + : static_cast<Type *>(StructType::get(ScalarTy, ScalarTy)); + + return {FunctionType::get(RetTy, {ScalarTy}, false), Attrs}; + } + case RTLIB::impl_sqrtf: + case RTLIB::impl_sqrt: { + AttrBuilder FuncAttrBuilder(Ctx); + + for (Attribute::AttrKind Attr : CommonFnAttrs) + FuncAttrBuilder.addAttribute(Attr); + FuncAttrBuilder.addMemoryAttr(MemoryEffects::errnoMemOnly(ModRefInfo::Mod)); + + AttributeList Attrs; + Attrs = Attrs.addFnAttributes(Ctx, FuncAttrBuilder); + + Type *ScalarTy = LibcallImpl == RTLIB::impl_sqrtf ? Type::getFloatTy(Ctx) + : Type::getDoubleTy(Ctx); + FunctionType *FuncTy = FunctionType::get(ScalarTy, {ScalarTy}, false); + + Attrs = Attrs.addRetAttribute( + Ctx, Attribute::getWithNoFPClass(Ctx, fcNegInf | fcNegSubnormal | + fcNegNormal)); + return {FuncTy, Attrs}; + } + default: + return {}; + } + + return {}; +} diff --git a/llvm/lib/MC/SPIRVObjectWriter.cpp b/llvm/lib/MC/SPIRVObjectWriter.cpp index 5e3713778286f..d693ea33d8d7b 100644 --- a/llvm/lib/MC/SPIRVObjectWriter.cpp +++ b/llvm/lib/MC/SPIRVObjectWriter.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/MC/MCAssembler.h" +#include "llvm/MC/MCContext.h" #include "llvm/MC/MCSPIRVObjectWriter.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCValue.h" @@ -17,8 +18,10 @@ using namespace llvm; void SPIRVObjectWriter::writeHeader(const MCAssembler &Asm) { constexpr uint32_t MagicNumber = 0x07230203; constexpr uint32_t GeneratorID = 43; - constexpr uint32_t GeneratorMagicNumber = - (GeneratorID << 16) | (LLVM_VERSION_MAJOR); + const uint32_t GeneratorMagicNumber = + Asm.getContext().getTargetTriple().getVendor() == Triple::AMD + ? UINT16_MAX + : ((GeneratorID << 16) | (LLVM_VERSION_MAJOR)); constexpr uint32_t Schema = 0; W.write<uint32_t>(MagicNumber); diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp index e09dc947c2779..c2f4560c06c0d 100644 --- a/llvm/lib/Object/MachOObjectFile.cpp +++ b/llvm/lib/Object/MachOObjectFile.cpp @@ -1978,20 +1978,42 @@ uint64_t MachOObjectFile::getSectionSize(DataRefImpl Sec) const { return SectSize; } -ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint32_t Offset, +ArrayRef<uint8_t> MachOObjectFile::getSectionContents(uint64_t Offset, uint64_t Size) const { return arrayRefFromStringRef(getData().substr(Offset, Size)); } Expected<ArrayRef<uint8_t>> MachOObjectFile::getSectionContents(DataRefImpl Sec) const { - uint32_t Offset; + uint64_t Offset; uint64_t Size; if (is64Bit()) { MachO::section_64 Sect = getSection64(Sec); Offset = Sect.offset; Size = Sect.size; + // Check for large mach-o files where the section contents might exceed + // 4GB. MachO::section_64 objects only have 32 bit file offsets to the + // section contents and can overflow in dSYM files. 
We can track this and + // adjust the section offset to be 64 bit safe. If sections overflow then + // section ordering is enforced. If sections are not ordered, then an error + // will be returned stopping invalid section data from being returned. + uint64_t PrevTrueOffset = 0; + uint64_t SectOffsetAdjust = 0; + for (uint32_t SectIdx = 0; SectIdx < Sec.d.a; ++SectIdx) { + MachO::section_64 CurrSect = + getStruct<MachO::section_64>(*this, Sections[SectIdx]); + uint64_t CurrTrueOffset = (uint64_t)CurrSect.offset + SectOffsetAdjust; + if ((SectOffsetAdjust > 0) && (PrevTrueOffset > CurrTrueOffset)) + return malformedError("section data exceeds 4GB and section file " + "offsets are not ordered"); + const uint64_t EndSectFileOffset = + (uint64_t)CurrSect.offset + CurrSect.size; + if (EndSectFileOffset > UINT32_MAX) + SectOffsetAdjust += EndSectFileOffset & 0xFFFFFFFF00000000ull; + PrevTrueOffset = CurrTrueOffset; + } + Offset += SectOffsetAdjust; } else { MachO::section Sect = getSection(Sec); Offset = Sect.offset; diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index bd03ac090721c..3f41618b18fcf 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -228,7 +228,7 @@ static cl::opt<bool> EnableLoopHeaderDuplication( static cl::opt<bool> EnableDFAJumpThreading("enable-dfa-jump-thread", cl::desc("Enable DFA jump threading"), - cl::init(false), cl::Hidden); + cl::init(true), cl::Hidden); static cl::opt<bool> EnableHotColdSplit("hot-cold-split", diff --git a/llvm/lib/Support/BalancedPartitioning.cpp b/llvm/lib/Support/BalancedPartitioning.cpp index 1914f4cc39d96..d859abddbcad8 100644 --- a/llvm/lib/Support/BalancedPartitioning.cpp +++ b/llvm/lib/Support/BalancedPartitioning.cpp @@ -231,7 +231,7 @@ unsigned BalancedPartitioning::runIteration(const FunctionNodeRange Nodes, } // Compute move gains - typedef std::pair<float, BPFunctionNode *> GainPair; + using GainPair = std::pair<float, BPFunctionNode *>; std::vector<GainPair> Gains; for (auto &N : Nodes) { bool FromLeftToRight = (N.Bucket == LeftBucket); diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp index de5bd795403dc..dab8beeff7ca5 100644 --- a/llvm/lib/Support/CommandLine.cpp +++ b/llvm/lib/Support/CommandLine.cpp @@ -2343,10 +2343,10 @@ namespace { class HelpPrinter { protected: const bool ShowHidden; - typedef SmallVector<std::pair<const char *, Option *>, 128> - StrOptionPairVector; - typedef SmallVector<std::pair<const char *, SubCommand *>, 128> - StrSubCommandPairVector; + using StrOptionPairVector = + SmallVector<std::pair<const char *, Option *>, 128>; + using StrSubCommandPairVector = + SmallVector<std::pair<const char *, SubCommand *>, 128>; // Print the options. Opts is assumed to be alphabetically sorted. 
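Several of the hunks here and below only swap typedef for a using alias; the two declare the same type, but using reads left-to-right and also extends to template aliases. A small self-contained illustration of the difference:

#include <map>
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

typedef std::vector<std::pair<std::string, int>> OldStyleVec;
using NewStyleVec = std::vector<std::pair<std::string, int>>;

// Alias templates have no typedef spelling at all.
template <typename T> using StringKeyMap = std::map<std::string, T>;

static_assert(std::is_same_v<OldStyleVec, NewStyleVec>, "same type");

int main() {
  StringKeyMap<int> M{{"a", 1}};
  return M.at("a") - 1;
}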
virtual void printOptions(StrOptionPairVector &Opts, size_t MaxArgLen) { for (const auto &Opt : Opts) diff --git a/llvm/lib/Support/DAGDeltaAlgorithm.cpp b/llvm/lib/Support/DAGDeltaAlgorithm.cpp index 981536473d124..3bfae147d18c0 100644 --- a/llvm/lib/Support/DAGDeltaAlgorithm.cpp +++ b/llvm/lib/Support/DAGDeltaAlgorithm.cpp @@ -47,16 +47,16 @@ class DAGDeltaAlgorithmImpl { friend class DeltaActiveSetHelper; public: - typedef DAGDeltaAlgorithm::change_ty change_ty; - typedef DAGDeltaAlgorithm::changeset_ty changeset_ty; - typedef DAGDeltaAlgorithm::changesetlist_ty changesetlist_ty; - typedef DAGDeltaAlgorithm::edge_ty edge_ty; + using change_ty = DAGDeltaAlgorithm::change_ty; + using changeset_ty = DAGDeltaAlgorithm::changeset_ty; + using changesetlist_ty = DAGDeltaAlgorithm::changesetlist_ty; + using edge_ty = DAGDeltaAlgorithm::edge_ty; private: - typedef std::vector<change_ty>::iterator pred_iterator_ty; - typedef std::vector<change_ty>::iterator succ_iterator_ty; - typedef std::set<change_ty>::iterator pred_closure_iterator_ty; - typedef std::set<change_ty>::iterator succ_closure_iterator_ty; + using pred_iterator_ty = std::vector<change_ty>::iterator; + using succ_iterator_ty = std::vector<change_ty>::iterator; + using pred_closure_iterator_ty = std::set<change_ty>::iterator; + using succ_closure_iterator_ty = std::set<change_ty>::iterator; DAGDeltaAlgorithm &DDA; diff --git a/llvm/lib/Support/DynamicLibrary.cpp b/llvm/lib/Support/DynamicLibrary.cpp index f1c15c00cedea..61566d3722419 100644 --- a/llvm/lib/Support/DynamicLibrary.cpp +++ b/llvm/lib/Support/DynamicLibrary.cpp @@ -23,7 +23,7 @@ using namespace llvm::sys; // All methods for HandleSet should be used holding SymbolsMutex. class DynamicLibrary::HandleSet { - typedef std::vector<void *> HandleList; + using HandleList = std::vector<void *>; HandleList Handles; void *Process = &Invalid; diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp index 708e79d39cd21..6c140be59fc4b 100644 --- a/llvm/lib/Support/Mustache.cpp +++ b/llvm/lib/Support/Mustache.cpp @@ -34,6 +34,31 @@ static bool isContextFalsey(const json::Value *V) { return isFalsey(*V); } +static void splitAndTrim(StringRef Str, SmallVectorImpl<StringRef> &Tokens) { + size_t CurrentPos = 0; + while (CurrentPos < Str.size()) { + // Find the next delimiter. + size_t DelimiterPos = Str.find('.', CurrentPos); + + // If no delimiter is found, process the rest of the string. + if (DelimiterPos == StringRef::npos) + DelimiterPos = Str.size(); + + // Get the current part, which may have whitespace. + StringRef Part = Str.slice(CurrentPos, DelimiterPos); + + // Manually trim the part without creating a new string object. + size_t Start = Part.find_first_not_of(" \t\r\n"); + if (Start != StringRef::npos) { + size_t End = Part.find_last_not_of(" \t\r\n"); + Tokens.push_back(Part.slice(Start, End + 1)); + } + + // Move past the delimiter for the next iteration. + CurrentPos = DelimiterPos + 1; + } +} + static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) { // We split the mustache string into an accessor. // For example: @@ -46,13 +71,7 @@ static Accessor splitMustacheString(StringRef Str, MustacheContext &Ctx) { // It's a literal, so it doesn't need to be saved. Tokens.push_back("."); } else { - while (!Str.empty()) { - StringRef Part; - std::tie(Part, Str) = Str.split('.'); - // Each part of the accessor needs to be saved to the arena - // to ensure it has a stable address. 
- Tokens.push_back(Ctx.Saver.save(Part.trim())); - } + splitAndTrim(Str, Tokens); } // Now, allocate memory for the array of StringRefs in the arena. StringRef *ArenaTokens = Ctx.Allocator.Allocate<StringRef>(Tokens.size()); @@ -368,141 +387,99 @@ struct Tag { llvm_unreachable("Unknown json::Value::Kind"); } -static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open, - StringRef Close) { - const StringLiteral TripleOpen("{{{"); - const StringLiteral TripleClose("}}}"); - - size_t NormalOpenPos = Template.find(Open, StartPos); - size_t TripleOpenPos = Template.find(TripleOpen, StartPos); - - Tag Result; - - // Determine which tag comes first. - if (TripleOpenPos != StringRef::npos && - (NormalOpenPos == StringRef::npos || TripleOpenPos <= NormalOpenPos)) { - // Found a triple mustache tag. - size_t EndPos = - Template.find(TripleClose, TripleOpenPos + TripleOpen.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. - - Result.TagKind = Tag::Kind::Triple; - Result.StartPosition = TripleOpenPos; - size_t ContentStart = TripleOpenPos + TripleOpen.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = Template.substr( - TripleOpenPos, (EndPos + TripleClose.size()) - TripleOpenPos); - } else if (NormalOpenPos != StringRef::npos) { - // Found a normal mustache tag. - size_t EndPos = Template.find(Close, NormalOpenPos + Open.size()); - if (EndPos == StringRef::npos) - return Result; // No closing tag found. - - Result.TagKind = Tag::Kind::Normal; - Result.StartPosition = NormalOpenPos; - size_t ContentStart = NormalOpenPos + Open.size(); - Result.Content = Template.substr(ContentStart, EndPos - ContentStart); - Result.FullMatch = - Template.substr(NormalOpenPos, (EndPos + Close.size()) - NormalOpenPos); - } - - return Result; -} - -static std::optional<std::pair<StringRef, StringRef>> -processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) { - LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content - << ", Kind: " << tagKindToString(T.TagKind) << "\n"); - if (T.TagKind == Tag::Kind::Triple) { - Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx); - return std::nullopt; - } - StringRef Interpolated = T.Content; - if (!Interpolated.trim().starts_with("=")) { - char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); - Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx); - return std::nullopt; - } - Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx); - StringRef DelimSpec = Interpolated.trim(); - DelimSpec = DelimSpec.drop_front(1); - DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); - DelimSpec = DelimSpec.trim(); - - std::pair<StringRef, StringRef> Ret = DelimSpec.split(' '); - LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << Ret.first - << ", NewClose: " << Ret.second << "\n"); - return Ret; -} - // Simple tokenizer that splits the template into tokens. -// The mustache spec allows {{{ }}} to unescape variables, -// but we don't support that here. An unescape variable -// is represented only by {{& variable}}. 
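splitAndTrim above walks a dotted accessor such as " a.b . c " and yields the trimmed parts, skipping parts that end up empty. A rough standalone equivalent over std::string_view (illustrative only; the real helper works on StringRef and saves the parts into the arena at the call site):

#include <cassert>
#include <string_view>
#include <vector>

// Split Str on '.' and trim ASCII whitespace from each part, dropping parts
// that are empty after trimming.
static std::vector<std::string_view> splitAndTrim(std::string_view Str) {
  std::vector<std::string_view> Parts;
  size_t Pos = 0;
  while (Pos < Str.size()) {
    size_t Dot = Str.find('.', Pos);
    if (Dot == std::string_view::npos)
      Dot = Str.size();
    std::string_view Part = Str.substr(Pos, Dot - Pos);
    size_t Begin = Part.find_first_not_of(" \t\r\n");
    if (Begin != std::string_view::npos) {
      size_t End = Part.find_last_not_of(" \t\r\n");
      Parts.push_back(Part.substr(Begin, End - Begin + 1));
    }
    Pos = Dot + 1;
  }
  return Parts;
}

int main() {
  auto P = splitAndTrim(" a.b . c ");
  assert(P.size() == 3 && P[0] == "a" && P[1] == "b" && P[2] == "c");
  return 0;
}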
static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) { LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n"); SmallVector<Token> Tokens; SmallString<8> Open("{{"); SmallString<8> Close("}}"); - size_t Start = 0; + size_t Cursor = 0; + size_t TextStart = 0; - while (Start < Template.size()) { - LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start:" << Start << ", Open:'" << Open - << "', Close:'" << Close << "'\n"); - Tag T = findNextTag(Template, Start, Open, Close); + const StringLiteral TripleOpen("{{{"); + const StringLiteral TripleClose("}}}"); - if (T.TagKind == Tag::Kind::None) { - // No more tags, the rest is text. - Tokens.emplace_back(Template.substr(Start)); - break; + while (Cursor < Template.size()) { + StringRef TemplateSuffix = Template.substr(Cursor); + StringRef TagOpen, TagClose; + Tag::Kind Kind; + + // Determine which tag we've encountered. + if (TemplateSuffix.starts_with(TripleOpen)) { + Kind = Tag::Kind::Triple; + TagOpen = TripleOpen; + TagClose = TripleClose; + } else if (TemplateSuffix.starts_with(Open)) { + Kind = Tag::Kind::Normal; + TagOpen = Open; + TagClose = Close; + } else { + // Not at a tag, continue scanning. + ++Cursor; + continue; } - // Add the text before the tag. - if (T.StartPosition > Start) { - StringRef Text = Template.substr(Start, T.StartPosition - Start); - Tokens.emplace_back(Text); + // Found a tag, first add the preceding text. + if (Cursor > TextStart) + Tokens.emplace_back(Template.slice(TextStart, Cursor)); + + // Find the closing tag. + size_t EndPos = Template.find(TagClose, Cursor + TagOpen.size()); + if (EndPos == StringRef::npos) { + // No closing tag, the rest is text. + Tokens.emplace_back(Template.substr(Cursor)); + TextStart = Cursor = Template.size(); + break; } - if (auto NewDelims = processTag(T, Tokens, Ctx)) { - std::tie(Open, Close) = *NewDelims; + // Extract tag content and full match. + size_t ContentStart = Cursor + TagOpen.size(); + StringRef Content = Template.substr(ContentStart, EndPos - ContentStart); + StringRef FullMatch = + Template.substr(Cursor, (EndPos + TagClose.size()) - Cursor); + + // Process the tag (inlined logic from processTag). + LLVM_DEBUG(dbgs() << "[Tag] " << FullMatch << ", Content: " << Content + << ", Kind: " << tagKindToString(Kind) << "\n"); + if (Kind == Tag::Kind::Triple) { + Tokens.emplace_back(FullMatch, Ctx.Saver.save("&" + Content), '&', Ctx); + } else { // Normal Tag + StringRef Interpolated = Content; + if (!Interpolated.trim().starts_with("=")) { + char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front(); + Tokens.emplace_back(FullMatch, Interpolated, Front, Ctx); + } else { // Set Delimiter + Tokens.emplace_back(FullMatch, Interpolated, '=', Ctx); + StringRef DelimSpec = Interpolated.trim(); + DelimSpec = DelimSpec.drop_front(1); + DelimSpec = DelimSpec.take_until([](char C) { return C == '='; }); + DelimSpec = DelimSpec.trim(); + + auto [NewOpen, NewClose] = DelimSpec.split(' '); + LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen + << ", NewClose: " << NewClose << "\n"); + Open = NewOpen; + Close = NewClose; + } } - // Move past the tag. - Start = T.StartPosition + T.FullMatch.size(); + // Move past the tag for the next iteration. + Cursor += FullMatch.size(); + TextStart = Cursor; } - // Fix up white spaces for: - // - open sections - // - inverted sections - // - close sections - // - comments - // - // This loop attempts to find standalone tokens and tries to trim out - // the surrounding whitespace. 
- // For example: - // if you have the template string - // {{#section}} \n Example \n{{/section}} - // The output should would be - // For example: - // \n Example \n + // Add any remaining text after the last tag. + if (TextStart < Template.size()) + Tokens.emplace_back(Template.substr(TextStart)); + + // Fix up white spaces for standalone tags. size_t LastIdx = Tokens.size() - 1; for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) { Token &CurrentToken = Tokens[Idx]; Token::Type CurrentType = CurrentToken.getType(); - // Check if token type requires cleanup. - bool RequiresCleanUp = requiresCleanUp(CurrentType); - - if (!RequiresCleanUp) + if (!requiresCleanUp(CurrentType)) continue; - // We adjust the token body if there's no text behind or ahead. - // A token is considered to have no text ahead if the right of the previous - // token is a newline followed by spaces. - // A token is considered to have no text behind if the left of the next - // token is spaces followed by a newline. - // eg. - // "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3" bool HasTextBehind = hasTextBehind(Idx, Tokens); bool HasTextAhead = hasTextAhead(Idx, Tokens); @@ -622,9 +599,16 @@ void Parser::parseSection(ASTNode *Parent, ASTNode::Type Ty, size_t Start = CurrentPtr; parseMustache(CurrentNode); const size_t End = CurrentPtr - 1; + + size_t RawBodySize = 0; + for (size_t I = Start; I < End; ++I) + RawBodySize += Tokens[I].RawBody.size(); + SmallString<128> RawBody; - for (std::size_t I = Start; I < End; I++) + RawBody.reserve(RawBodySize); + for (std::size_t I = Start; I < End; ++I) RawBody += Tokens[I].RawBody; + CurrentNode->setRawBody(Ctx.Saver.save(StringRef(RawBody))); Parent->addChild(CurrentNode); } diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp index 9d45096dddd97..b08f5083e00a8 100644 --- a/llvm/lib/Support/Timer.cpp +++ b/llvm/lib/Support/Timer.cpp @@ -207,7 +207,7 @@ void TimeRecord::print(const TimeRecord &Total, raw_ostream &OS) const { namespace { -typedef StringMap<Timer> Name2TimerMap; +using Name2TimerMap = StringMap<Timer>; class Name2PairMap { StringMap<std::pair<TimerGroup*, Name2TimerMap> > Map; diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index afce803f3f568..8ad20b45f5e16 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -46,12 +46,11 @@ using namespace llvm; // Context //===----------------------------------------------------------------------===// -namespace llvm::detail { /// This class represents the internal implementation of the RecordKeeper. /// It contains all of the contextual static state of the Record classes. It is /// kept out-of-line to simplify dependencies, and also make it easier for /// internal classes to access the uniquer state of the keeper. -struct RecordKeeperImpl { +struct detail::RecordKeeperImpl { RecordKeeperImpl(RecordKeeper &RK) : SharedBitRecTy(RK), SharedIntRecTy(RK), SharedStringRecTy(RK), SharedDagRecTy(RK), AnyRecord(RK, {}), TheUnsetInit(RK), @@ -99,7 +98,6 @@ struct RecordKeeperImpl { void dumpAllocationStats(raw_ostream &OS) const; }; -} // namespace llvm::detail void detail::RecordKeeperImpl::dumpAllocationStats(raw_ostream &OS) const { // Dump memory allocation related stats. 
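The parseSection change above sizes the raw body once, reserves, and then appends, which avoids repeated reallocation while concatenating the token bodies. The same pattern with std::string, as a sketch of the idea rather than the parser itself:

#include <cassert>
#include <string>
#include <vector>

static std::string joinBodies(const std::vector<std::string> &Bodies) {
  size_t Total = 0;
  for (const std::string &B : Bodies)
    Total += B.size();

  std::string Raw;
  Raw.reserve(Total); // one allocation instead of one per append
  for (const std::string &B : Bodies)
    Raw += B;
  return Raw;
}

int main() {
  assert(joinBodies({"{{#s}}", "text", "{{/s}}"}) == "{{#s}}text{{/s}}");
  return 0;
}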
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp index 1169f26a2ae37..97298f9d74171 100644 --- a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp @@ -655,16 +655,10 @@ Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) { BasicBlock *BB = BasicBlock::Create(M->getContext(), "", GuestExit); IRBuilder<> B(BB); - // Load the global symbol as a pointer to the check function. - Value *GuardFn; - if (cfguard_module_flag == 2 && !F->hasFnAttribute("guard_nocf")) - GuardFn = GuardFnCFGlobal; - else - GuardFn = GuardFnGlobal; - LoadInst *GuardCheckLoad = B.CreateLoad(PtrTy, GuardFn); - - // Create new call instruction. The CFGuard check should always be a call, - // even if the original CallBase is an Invoke or CallBr instruction. + // Create new call instruction. The call check should always be a call, + // even if the original CallBase is an Invoke or CallBr instructio. + // This is treated as a direct call, so do not use GuardFnCFGlobal. + LoadInst *GuardCheckLoad = B.CreateLoad(PtrTy, GuardFnGlobal); Function *Thunk = buildExitThunk(F->getFunctionType(), F->getAttributes()); CallInst *GuardCheck = B.CreateCall( GuardFnType, GuardCheckLoad, {F, Thunk}); diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 1b5a713bffdc9..34c85d588f9c4 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -601,6 +601,12 @@ def CSR_Win_AArch64_AAPCS_SwiftError def CSR_Win_AArch64_AAPCS_SwiftTail : CalleeSavedRegs<(sub CSR_Win_AArch64_AAPCS, X20, X22)>; +def CSR_Win_AArch64_RT_MostRegs + : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, (sequence "X%u", 9, 15))>; + +def CSR_Win_AArch64_RT_AllRegs + : CalleeSavedRegs<(add CSR_Win_AArch64_RT_MostRegs, (sequence "Q%u", 8, 31))>; + // The Control Flow Guard check call uses a custom calling convention that also // preserves X0-X8 and Q0-Q7. def CSR_Win_AArch64_CFGuard_Check : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index cf344980cbaae..18e246e5af57d 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -81,10 +81,7 @@ namespace { class AArch64FastISel final : public FastISel { class Address { public: - using BaseKind = enum { - RegBase, - FrameIndexBase - }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 0f7b34c36055f..3ee4d58ca892c 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2380,13 +2380,6 @@ void AArch64FrameLowering::determineStackHazardSlot( return; } - const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); - if (MFI.hasVarSizedObjects() || TRI->hasStackRealignment(MF)) { - LLVM_DEBUG(dbgs() << "SplitSVEObjects is not supported with variable " - "sized objects or realignment\n"); - return; - } - // If another calling convention is explicitly set FPRs can't be promoted to // ZPR callee-saves. 
if (!is_contained({CallingConv::C, CallingConv::Fast, @@ -2402,6 +2395,7 @@ void AArch64FrameLowering::determineStackHazardSlot( assert(Subtarget.isSVEorStreamingSVEAvailable() && "Expected SVE to be available for PPRs"); + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); // With SplitSVEObjects the CS hazard padding is placed between the // PPRs and ZPRs. If there are any FPR CS there would be a hazard between // them and the CS GRPs. Avoid this by promoting all FPR CS to ZPRs. diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 60aa61e993b26..d08f9b94227a2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1052,15 +1052,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, // Lower READCYCLECOUNTER using an mrs from CNTVCT_EL0. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); - if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && - getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { - // Issue __sincos_stret if available. - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); - } else { - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::f32, Expand); - } + // Issue __sincos_stret if available. + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Make floating-point constants legal for the large code model, so they don't // become loads from the constant pool. @@ -5346,35 +5340,6 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, return SDValue(); } -SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, - SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // which returns the values in two S / D registers. - SDLoc DL(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - RTLIB::Libcall LC = ArgVT == MVT::f64 ? 
RTLIB::SINCOS_STRET_F64 - : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = getLibcallName(LC); - SDValue Callee = - DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); - - StructType *RetTy = StructType::get(ArgTy, ArgTy); - TargetLowering::CallLoweringInfo CLI(DAG); - CallingConv::ID CC = getLibcallCallingConv(LC); - CLI.setDebugLoc(DL) - .setChain(DAG.getEntryNode()) - .setLibCallee(CC, RetTy, Callee, std::move(Args)); - - std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - return CallResult.first; -} - static MVT getSVEContainerType(EVT ContentTy); SDValue @@ -7723,8 +7688,6 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG); - case ISD::FSINCOS: - return LowerFSINCOS(Op, DAG); case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG); case ISD::SET_ROUNDING: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 2cb8ed29f252a..70bfae717fb76 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -745,7 +745,6 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index e69fa32967a79..2ab7bf19da410 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -1386,6 +1386,25 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, if (MOP.isReg() && MOP.isKill()) DefinedInBB.addReg(MOP.getReg()); + // Copy over any implicit-def operands. This is like MI.copyImplicitOps, but + // only copies implicit defs and makes sure that each operand is only added + // once in case of duplicates. + auto CopyImplicitOps = [&](MachineBasicBlock::iterator MI1, + MachineBasicBlock::iterator MI2) { + SmallSetVector<Register, 4> Ops; + for (const MachineOperand &MO : + llvm::drop_begin(MI1->operands(), MI1->getDesc().getNumOperands())) + if (MO.isReg() && MO.isImplicit() && MO.isDef()) + Ops.insert(MO.getReg()); + for (const MachineOperand &MO : + llvm::drop_begin(MI2->operands(), MI2->getDesc().getNumOperands())) + if (MO.isReg() && MO.isImplicit() && MO.isDef()) + Ops.insert(MO.getReg()); + for (auto Op : Ops) + MIB.addDef(Op, RegState::Implicit); + }; + CopyImplicitOps(I, Paired); + // Erase the old instructions. 
I->eraseFromParent(); Paired->eraseFromParent();
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp index 45b7120112af2..4df4d54e60c95 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.cpp @@ -805,7 +805,7 @@ void AArch64PrologueEmitter::emitPrologue() { CFAOffset += SVEAllocs.BeforePPRs; assert(PPRRange.End == ZPRRange.Begin && "Expected ZPR callee saves after PPR locals"); - allocateStackSpace(PPRRange.End, RealignmentPadding, SVEAllocs.AfterPPRs, + allocateStackSpace(PPRRange.End, 0, SVEAllocs.AfterPPRs, EmitAsyncCFI && !HasFP, CFAOffset, MFI.hasVarSizedObjects() || SVEAllocs.AfterZPRs); CFAOffset += SVEAllocs.AfterPPRs; @@ -1318,6 +1318,26 @@ AArch64EpilogueEmitter::AArch64EpilogueEmitter(MachineFunction &MF, SEHEpilogueStartI = MBB.end(); } +void AArch64EpilogueEmitter::moveSPBelowFP(MachineBasicBlock::iterator MBBI, + StackOffset Offset) { + // Other combinations could be supported, but are not currently needed. + assert(Offset.getScalable() < 0 && Offset.getFixed() <= 0 && + "expected negative offset (with optional fixed portion)"); + Register Base = AArch64::FP; + if (int64_t FixedOffset = Offset.getFixed()) { + // If we have a negative fixed offset, we need to first subtract it in a + // temporary register (to avoid briefly deallocating the scalable + // portion of the offset). + Base = MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); + emitFrameOffset(MBB, MBBI, DL, Base, AArch64::FP, + StackOffset::getFixed(FixedOffset), TII, + MachineInstr::FrameDestroy); + } + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, Base, + StackOffset::getScalable(Offset.getScalable()), TII, + MachineInstr::FrameDestroy); +} + void AArch64EpilogueEmitter::emitEpilogue() { MachineBasicBlock::iterator EpilogueEndI = MBB.getLastNonDebugInstr(); if (MBB.end() != EpilogueEndI) { @@ -1418,6 +1438,7 @@ void AArch64EpilogueEmitter::emitEpilogue() { AfterCSRPopSize += ProloguePopSize; } } + // Move past the restores of the callee-saved registers. // If we plan on combining the sp bump of the local stack size and the callee // save stack size, we might need to adjust the CSR save and restore offsets. @@ -1483,7 +1504,6 @@ void AArch64EpilogueEmitter::emitEpilogue() { StackOffset SVECalleeSavesSize = ZPR.CalleeSavesSize + PPR.CalleeSavesSize; SVEStackAllocations SVEAllocs = getSVEStackAllocations({PPR, ZPR}); - MachineBasicBlock::iterator RestoreBegin = ZPRRange.Begin; // Deallocate the SVE area. if (SVELayout == SVEStackLayout::CalleeSavesAboveFrameRecord) { @@ -1510,28 +1530,25 @@ void AArch64EpilogueEmitter::emitEpilogue() { (AFI->isStackRealigned() || MFI.hasVarSizedObjects()) ? AArch64::FP : AArch64::SP; if (SVECalleeSavesSize && BaseForSVEDealloc == AArch64::FP) { - // TODO: Support stack realigment and variable-sized objects. - assert( - SVELayout != SVEStackLayout::Split && - "unexpected stack realignment or variable sized objects with split " - "SVE stack objects"); - - Register CalleeSaveBase = AArch64::FP; - if (int64_t CalleeSaveBaseOffset = - AFI->getCalleeSaveBaseToFrameRecordOffset()) { - // If we have have an non-zero offset to the non-SVE CS base we need to - // compute the base address by subtracting the offest in a temporary - // register first (to avoid briefly deallocating the SVE CS).
- CalleeSaveBase = MBB.getParent()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); - emitFrameOffset(MBB, RestoreBegin, DL, CalleeSaveBase, AArch64::FP, - StackOffset::getFixed(-CalleeSaveBaseOffset), TII, - MachineInstr::FrameDestroy); + if (ZPR.CalleeSavesSize || SVELayout != SVEStackLayout::Split) { + // The offset from the frame-pointer to the start of the ZPR saves. + StackOffset FPOffsetZPR = + -SVECalleeSavesSize - PPR.LocalsSize - + StackOffset::getFixed(AFI->getCalleeSaveBaseToFrameRecordOffset()); + // Deallocate the stack space by moving the SP to the start of the + // ZPR/PPR callee-save area. + moveSPBelowFP(ZPRRange.Begin, FPOffsetZPR); + } + // With split SVE, the predicates are stored in a separate area above the + // ZPR saves, so we must adjust the stack to the start of the PPRs. + if (PPR.CalleeSavesSize && SVELayout == SVEStackLayout::Split) { + // The offset from the frame-pointer to the start of the PPR saves. + StackOffset FPOffsetPPR = -PPR.CalleeSavesSize; + // Move to the start of the PPR area. + assert(!FPOffsetPPR.getFixed() && "expected only scalable offset"); + emitFrameOffset(MBB, ZPRRange.End, DL, AArch64::SP, AArch64::FP, + FPOffsetPPR, TII, MachineInstr::FrameDestroy); } - // The code below will deallocate the stack space space by moving the SP - // to the start of the SVE callee-save area. - emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, CalleeSaveBase, - -SVECalleeSavesSize, TII, MachineInstr::FrameDestroy); } else if (BaseForSVEDealloc == AArch64::SP) { auto NonSVELocals = StackOffset::getFixed(NumBytes); auto CFAOffset = NonSVELocals + StackOffset::getFixed(PrologueSaveSize) +
diff --git a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h index 6e0e28324a0ac..7f297b5d337b0 100644 --- a/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h +++ b/llvm/lib/Target/AArch64/AArch64PrologueEpilogue.h @@ -180,6 +180,10 @@ class AArch64EpilogueEmitter final : public AArch64PrologueEpilogueCommon { private: bool shouldCombineCSRLocalStackBump(uint64_t StackBumpBytes) const; + /// A helper for moving the SP to a negative offset from the FP, without + /// deallocating any stack in the range FP to FP + Offset. + void moveSPBelowFP(MachineBasicBlock::iterator MBBI, StackOffset Offset); + void emitSwiftAsyncContextFramePointer(MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 5bfb19d9a7e61..a5048b9c9e61d 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -90,6 +90,16 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin()) return getDarwinCalleeSavedRegs(MF); + if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) + return MF->getSubtarget<AArch64Subtarget>().isTargetWindows() + ? CSR_Win_AArch64_RT_MostRegs_SaveList + : CSR_AArch64_RT_MostRegs_SaveList; + + if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll) + return MF->getSubtarget<AArch64Subtarget>().isTargetWindows() + ?
CSR_Win_AArch64_RT_AllRegs_SaveList + : CSR_AArch64_RT_AllRegs_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check) return CSR_Win_AArch64_CFGuard_Check_SaveList; if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows()) { @@ -138,10 +148,6 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_AAPCS_SwiftError_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::SwiftTail) return CSR_AArch64_AAPCS_SwiftTail_SaveList; - if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost) - return CSR_AArch64_RT_MostRegs_SaveList; - if (MF->getFunction().getCallingConv() == CallingConv::PreserveAll) - return CSR_AArch64_RT_AllRegs_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::Win64) // This is for OSes other than Windows; Windows is a separate case further // above.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 5b5565afd62b1..197aae6e03cb1 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3007,9 +3007,9 @@ AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { llvm_unreachable("Unsupported register kind"); } -bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, - ArrayRef<const Value *> Args, - Type *SrcOverrideTy) const { +bool AArch64TTIImpl::isSingleExtWideningInstruction( + unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args, + Type *SrcOverrideTy) const { // A helper that returns a vector type from the given type. The number of // elements in type Ty determines the vector width. auto toVectorTy = [&](Type *ArgTy) { @@ -3027,48 +3027,29 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) return false; - // Determine if the operation has a widening variant. We consider both the - // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the - // instructions. - // - // TODO: Add additional widening operations (e.g., shl, etc.) once we - // verify that their extending operands are eliminated during code - // generation. Type *SrcTy = SrcOverrideTy; switch (Opcode) { - case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). - case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). + case Instruction::Add: // UADDW(2), SADDW(2). + case Instruction::Sub: { // USUBW(2), SSUBW(2). // The second operand needs to be an extend if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) { if (!SrcTy) SrcTy = toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType()); - } else + break; + } + + if (Opcode == Instruction::Sub) return false; - break; - case Instruction::Mul: { // SMULL(2), UMULL(2) - // Both operands need to be extends of the same type. - if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) || - (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) { + + // UADDW(2), SADDW(2) can be commuted. + if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) { if (!SrcTy) SrcTy = toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType()); - } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) { - // If one of the operands is a Zext and the other has enough zero bits to - // be treated as unsigned, we can still general a umull, meaning the zext - // is free. - KnownBits Known = - computeKnownBits(isa<ZExtInst>(Args[0]) ?
Args[1] : Args[0], DL); - if (Args[0]->getType()->getScalarSizeInBits() - - Known.Zero.countLeadingOnes() > - DstTy->getScalarSizeInBits() / 2) - return false; - if (!SrcTy) - SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(), - DstTy->getScalarSizeInBits() / 2)); - } else - return false; - break; + break; + } + return false; } default: return false; @@ -3099,6 +3080,73 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize; } +Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy) const { + if (Opcode != Instruction::Add && Opcode != Instruction::Sub && + Opcode != Instruction::Mul) + return nullptr; + + // Exit early if DstTy is not a vector type whose elements are one of [i16, + // i32, i64]. SVE doesn't generally have the same set of instructions to + // perform an extend with the add/sub/mul. There are SMULLB style + // instructions, but they operate on top/bottom, requiring some sort of lane + // interleaving to be used with zext/sext. + unsigned DstEltSize = DstTy->getScalarSizeInBits(); + if (!useNeonVector(DstTy) || Args.size() != 2 || + (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64)) + return nullptr; + + auto getScalarSizeWithOverride = [&](const Value *V) { + if (SrcOverrideTy) + return SrcOverrideTy->getScalarSizeInBits(); + return cast<Instruction>(V) + ->getOperand(0) + ->getType() + ->getScalarSizeInBits(); + }; + + unsigned MaxEltSize = 0; + if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) || + (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + MaxEltSize = std::max(EltSize0, EltSize1); + } else if (isa<SExtInst, ZExtInst>(Args[0]) && + isa<SExtInst, ZExtInst>(Args[1])) { + unsigned EltSize0 = getScalarSizeWithOverride(Args[0]); + unsigned EltSize1 = getScalarSizeWithOverride(Args[1]); + // mul(sext, zext) will become smull(sext, zext) if the extends are large + // enough. + if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2) + return nullptr; + MaxEltSize = DstEltSize / 2; + } else if (Opcode == Instruction::Mul && + (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) { + // If one of the operands is a Zext and the other has enough zero bits + // to be treated as unsigned, we can still generate a umull, meaning the + // zext is free. + KnownBits Known = + computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL); + if (Args[0]->getType()->getScalarSizeInBits() - + Known.Zero.countLeadingOnes() > + DstTy->getScalarSizeInBits() / 2) + return nullptr; + + MaxEltSize = + getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? 
Args[0] : Args[1]); + } else + return nullptr; + + if (MaxEltSize * 2 > DstEltSize) + return nullptr; + + Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2); + if (ExtTy->getPrimitiveSizeInBits() <= 64) + return nullptr; + return ExtTy; +} + // s/urhadd instructions implement the following pattern, making the // extends free: // %x = add ((zext i8 -> i16), 1) @@ -3159,7 +3207,24 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, if (I && I->hasOneUser()) { auto *SingleUser = cast<Instruction>(*I->user_begin()); SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); - if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) { + if (Type *ExtTy = isBinExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { + // The cost from Src->Src*2 needs to be added if required, the cost from + // Src*2->ExtTy is free. + if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) { + Type *DoubleSrcTy = + Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2); + return getCastInstrCost(Opcode, DoubleSrcTy, Src, + TTI::CastContextHint::None, CostKind); + } + + return 0; + } + + if (isSingleExtWideningInstruction( + SingleUser->getOpcode(), Dst, Operands, + Src != I->getOperand(0)->getType() ? Src : nullptr)) { // For adds only count the second operand as free if both operands are // extends but not the same operation. (i.e both operands are not free in // add(sext, zext)). @@ -3168,8 +3233,11 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, (isa<CastInst>(SingleUser->getOperand(1)) && cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode)) return 0; - } else // Others are free so long as isWideningInstruction returned true. + } else { + // Others are free so long as isSingleExtWideningInstruction + // returned true. return 0; + } } // The cast will be free for the s/urhadd instructions @@ -4148,6 +4216,18 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( })) return *PromotedCost; + // If the operation is a widening instruction (smull or umull) and both + // operands are extends the cost can be cheaper by considering that the + // operation will operate on the narrowest type size possible (double the + // largest input size) and a further extend. + if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) { + if (ExtTy != Ty) + return getArithmeticInstrCost(Opcode, ExtTy, CostKind) + + getCastInstrCost(Instruction::ZExt, Ty, ExtTy, + TTI::CastContextHint::None, CostKind); + return LT.first; + } + switch (ISD) { default: return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, @@ -4381,10 +4461,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // - two 2-cost i64 inserts, and // - two 1-cost muls. // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with - // LT.first = 2 the cost is 28. If both operands are extensions it will not - // need to scalarize so the cost can be cheaper (smull or umull). - // so the cost can be cheaper (smull or umull). - if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args)) + // LT.first = 2 the cost is 28. 
+ if (LT.second != MVT::v2i64) return LT.first; return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() * (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) + @@ -6129,7 +6207,8 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, } static bool containsDecreasingPointers(Loop *TheLoop, - PredicatedScalarEvolution *PSE) { + PredicatedScalarEvolution *PSE, + const DominatorTree &DT) { const auto &Strides = DenseMap<Value *, const SCEV *>(); for (BasicBlock *BB : TheLoop->blocks()) { // Scan the instructions in the block and look for addresses that are @@ -6138,8 +6217,8 @@ static bool containsDecreasingPointers(Loop *TheLoop, if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) { Value *Ptr = getLoadStorePointerOperand(&I); Type *AccessTy = getLoadStoreType(&I); - if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true, - /*ShouldCheckWrap=*/false) + if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides, + /*Assume=*/true, /*ShouldCheckWrap=*/false) .value_or(0) < 0) return true; } @@ -6184,7 +6263,8 @@ bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { // negative strides. This will require extra work to reverse the loop // predicate, which may be expensive. if (containsDecreasingPointers(TFI->LVL->getLoop(), - TFI->LVL->getPredicatedScalarEvolution())) + TFI->LVL->getPredicatedScalarEvolution(), + *TFI->LVL->getDominatorTree())) Required |= TailFoldingOpts::Reverse; if (Required == TailFoldingOpts::Disabled) Required |= TailFoldingOpts::Simple;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index b39546a9a381d..e3b0a1bec53ec 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -59,9 +59,17 @@ class AArch64TTIImpl final : public BasicTTIImplBase<AArch64TTIImpl> { VECTOR_LDST_FOUR_ELEMENTS }; - bool isWideningInstruction(Type *DstTy, unsigned Opcode, - ArrayRef<const Value *> Args, - Type *SrcOverrideTy = nullptr) const; + /// Given an add/sub/mul operation, detect a widening addl/subl/mull pattern + /// where both operands can be treated like extends. Returns the minimal type + /// needed to compute the operation. + Type *isBinExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; + /// Given an add/sub operation with a single extend operand, detect a + /// widening addw/subw pattern. + bool isSingleExtWideningInstruction(unsigned Opcode, Type *DstTy, + ArrayRef<const Value *> Args, + Type *SrcOverrideTy = nullptr) const; // A helper function called by 'getVectorInstrCost'.
// diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 8669978637f40..56ab040706a13 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -38,9 +38,10 @@ enum ImplicitArgumentPositions { #define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS, enum ImplicitArgumentMask { - NOT_IMPLICIT_INPUT = 0, + UNKNOWN_INTRINSIC = 0, #include "AMDGPUAttributes.def" - ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1 + ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1, + NOT_IMPLICIT_INPUT }; #define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str}, @@ -115,7 +116,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit, NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5); return QUEUE_PTR; default: - return NOT_IMPLICIT_INPUT; + return UNKNOWN_INTRINSIC; } } @@ -534,6 +535,21 @@ struct AAAMDAttributesFunction : public AAAMDAttributes { ImplicitArgumentMask AttrMask = intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit, HasApertureRegs, SupportsGetDoorbellID, COV); + + if (AttrMask == UNKNOWN_INTRINSIC) { + // Assume not-nocallback intrinsics may invoke a function which accesses + // implicit arguments. + // + // FIXME: This isn't really the correct check. We want to ensure it + // isn't calling any function that may use implicit arguments regardless + // of whether it's internal to the module or not. + // + // TODO: Ignoring callsite attributes. + if (!Callee->hasFnAttribute(Attribute::NoCallback)) + return indicatePessimisticFixpoint(); + continue; + } + if (AttrMask != NOT_IMPLICIT_INPUT) { if ((IsNonEntryFunc || !NonKernelOnly)) removeAssumedBits(AttrMask); @@ -1357,7 +1373,10 @@ struct AAAMDGPUMinAGPRAlloc default: // Some intrinsics may use AGPRs, but if we have a choice, we are not // required to use AGPRs. - return true; + + // Assume !nocallback intrinsics may call a function which requires + // AGPRs. 
+ return CB.hasFnAttr(Attribute::NoCallback); } // TODO: Handle callsite attributes diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 0c977416f1793..15ed60b46a9c0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -116,8 +116,14 @@ bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI, if (!DstRC || DstRC != SrcRC) return false; - return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) && - RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI); + if (!RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) || + !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI)) + return false; + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } + return true; } bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { @@ -602,6 +608,7 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( I.setDesc(TII.get(Opc)); I.addOperand(*MF, MachineOperand::CreateImm(0)); I.addImplicitDefUseOperands(*MF); + I.getOperand(0).setIsEarlyClobber(true); return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } @@ -3787,6 +3794,10 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const { MI.removeOperand(1); // Intrinsic ID MI.addOperand(VDst_In); // Readd VDst_In to the end MI.addImplicitDefUseOperands(*MI.getParent()->getParent()); + const MCInstrDesc &MCID = MI.getDesc(); + if (MCID.getOperandConstraint(0, MCOI::EARLY_CLOBBER) != -1) { + MI.getOperand(0).setIsEarlyClobber(true); + } return true; } @@ -6753,7 +6764,7 @@ bool AMDGPUInstructionSelector::selectSGetBarrierState( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(2); std::optional<int64_t> BarValImm = getIConstantVRegSExtVal(BarOp.getReg(), *MRI); @@ -6806,8 +6817,8 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInit( MachineInstr &I, Intrinsic::ID IntrID) const { MachineBasicBlock *MBB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); - MachineOperand BarOp = I.getOperand(1); - MachineOperand CntOp = I.getOperand(2); + const MachineOperand &BarOp = I.getOperand(1); + const MachineOperand &CntOp = I.getOperand(2); // BarID = (BarOp >> 4) & 0x3F Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index dd474ac52c3c8..1e5885a25c195 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -913,6 +913,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}}); + addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}}); + addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard) .Uni(S64, {{Sgpr64}, {}}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 733c5d520fb23..fe81a5efd9d51 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -181,14 +181,52 @@ BasicBlock 
*AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet( return NewRetBlock; } +static BasicBlock * +createDummyReturnBlock(Function &F, + SmallVector<BasicBlock *, 4> &ReturningBlocks) { + BasicBlock *DummyReturnBB = + BasicBlock::Create(F.getContext(), "DummyReturnBlock", &F); + Type *RetTy = F.getReturnType(); + Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); + ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); + ReturningBlocks.push_back(DummyReturnBB); + return DummyReturnBB; +} + +/// Handle conditional branch instructions (-> 2 targets) and callbr +/// instructions with N targets. +static void handleNBranch(Function &F, BasicBlock *BB, Instruction *BI, + BasicBlock *DummyReturnBB, + std::vector<DominatorTree::UpdateType> &Updates) { + SmallVector<BasicBlock *, 2> Successors(successors(BB)); + + // Create a new transition block to hold the conditional branch. + BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); + + Updates.reserve(Updates.size() + 2 * Successors.size() + 2); + + // 'Successors' become successors of TransitionBB instead of BB, + // and TransitionBB becomes a single successor of BB. + Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); + for (BasicBlock *Successor : Successors) { + Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); + Updates.emplace_back(DominatorTree::Delete, BB, Successor); + } + + // Create a branch that will always branch to the transition block and + // references DummyReturnBB. + BB->getTerminator()->eraseFromParent(); + BranchInst::Create(TransitionBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); + Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); +} + bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT, const UniformityInfo &UA) { - assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator."); - if (PDT.root_size() == 0 || (PDT.root_size() == 1 && - !isa<BranchInst>(PDT.getRoot()->getTerminator()))) + !isa<BranchInst, CallBrInst>(PDT.getRoot()->getTerminator()))) return false; // Loop over all of the blocks in a function, tracking all of the blocks that @@ -222,46 +260,28 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, if (HasDivergentExitBlock) UnreachableBlocks.push_back(BB); } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { - - ConstantInt *BoolTrue = ConstantInt::getTrue(F.getContext()); - if (DummyReturnBB == nullptr) { - DummyReturnBB = BasicBlock::Create(F.getContext(), - "DummyReturnBlock", &F); - Type *RetTy = F.getReturnType(); - Value *RetVal = RetTy->isVoidTy() ? nullptr : PoisonValue::get(RetTy); - ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB); - ReturningBlocks.push_back(DummyReturnBB); - } + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); if (BI->isUnconditional()) { BasicBlock *LoopHeaderBB = BI->getSuccessor(0); BI->eraseFromParent(); // Delete the unconditional branch. // Add a new conditional branch with a dummy edge to the return block. - BranchInst::Create(LoopHeaderBB, DummyReturnBB, BoolTrue, BB); - Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); - } else { // Conditional branch. - SmallVector<BasicBlock *, 2> Successors(successors(BB)); - - // Create a new transition block to hold the conditional branch. 
- BasicBlock *TransitionBB = BB->splitBasicBlock(BI, "TransitionBlock"); - - Updates.reserve(Updates.size() + 2 * Successors.size() + 2); - - // 'Successors' become successors of TransitionBB instead of BB, - // and TransitionBB becomes a single successor of BB. - Updates.emplace_back(DominatorTree::Insert, BB, TransitionBB); - for (BasicBlock *Successor : Successors) { - Updates.emplace_back(DominatorTree::Insert, TransitionBB, Successor); - Updates.emplace_back(DominatorTree::Delete, BB, Successor); - } - - // Create a branch that will always branch to the transition block and - // references DummyReturnBB. - BB->getTerminator()->eraseFromParent(); - BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB); + BranchInst::Create(LoopHeaderBB, DummyReturnBB, + ConstantInt::getTrue(F.getContext()), BB); Updates.emplace_back(DominatorTree::Insert, BB, DummyReturnBB); + } else { + handleNBranch(F, BB, BI, DummyReturnBB, Updates); } Changed = true; + } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(BB->getTerminator())) { + if (!DummyReturnBB) + DummyReturnBB = createDummyReturnBlock(F, ReturningBlocks); + + handleNBranch(F, BB, CBI, DummyReturnBB, Updates); + Changed = true; + } else { + llvm_unreachable("unsupported block terminator"); } } diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 52cc4ca5a955c..1a14629fb66b3 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -435,7 +435,7 @@ void GCNHazardRecognizer::RecedeCycle() { // Helper Functions //===----------------------------------------------------------------------===// -using HazardFnResult = enum { HazardFound, HazardExpired, NoHazardFound }; +enum HazardFnResult { HazardFound, HazardExpired, NoHazardFound }; using IsExpiredFn = function_ref<bool(const MachineInstr &, int WaitStates)>; using GetNumWaitStatesFn = function_ref<unsigned int(const MachineInstr &)>; diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp index 959ce6904ce4d..1682abbdea169 100644 --- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -43,7 +43,7 @@ class GCNNSAReassignImpl { bool run(MachineFunction &MF); private: - using NSA_Status = enum { + enum NSA_Status { NOT_NSA, // Not an NSA instruction FIXED, // NSA which we cannot modify NON_CONTIGUOUS, // NSA with non-sequential address which we can try diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index 4deb2a9485e4d..62172a0bb89db 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -136,7 +136,7 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) { continue; if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) { - MachineOperand DefSrcMO = Def.getOperand(1); + const MachineOperand &DefSrcMO = Def.getOperand(1); // Immediates are not an issue and can be propagated in // postrapseudos pass. 
Only handle cases where defining diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp index 2aa54c920a046..31eca049fd149 100644 --- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1129,12 +1129,9 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && TruncatingStore) { // Add an extra level of chain to isolate this vector SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain); - // TODO: can the chain be replaced without creating a new store? - SDValue NewStore = DAG.getTruncStore( - NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(), MemVT, - StoreNode->getAlign(), StoreNode->getMemOperand()->getFlags(), - StoreNode->getAAInfo()); - StoreNode = cast<StoreSDNode>(NewStore); + SmallVector<SDValue, 4> NewOps(StoreNode->ops()); + NewOps[0] = NewChain; + StoreNode = cast<StoreSDNode>(DAG.UpdateNodeOperands(StoreNode, NewOps)); } return scalarizeVectorStore(StoreNode, DAG); diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 5c39f7a3d6daa..aa5ea77f17291 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -2170,7 +2170,9 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const { return MFI.getStackSize() != 0; } - return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() || + return (frameTriviallyRequiresSP(MFI) && + !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) || + MFI.isFrameAddressTaken() || MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment( MF) || mayReserveScratchForCWSR(MF) || diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 45f591927b86e..9460145d47111 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -7945,7 +7945,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, } legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; @@ -7985,7 +7985,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, legalizeOperandsVALUt16(*NewInstr, MRI); legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr); - MachineOperand SCCOp = Inst.getOperand(SCCIdx); + const MachineOperand &SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); return; @@ -8183,7 +8183,7 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, AMDGPU::OpName::src0_modifiers) >= 0) NewInstr.addImm(0); if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) { - MachineOperand Src = Inst.getOperand(1); + const MachineOperand &Src = Inst.getOperand(1); NewInstr->addOperand(Src); } @@ -9199,7 +9199,7 @@ void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist, addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, +void SIInstrInfo::addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond) const { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index dc23a21f959ce..0643b532ea04c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -172,7 +172,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI, SIInstrWorklist &Worklist) const; - void addSCCDefUsersToVALUWorklist(MachineOperand &Op, + void addSCCDefUsersToVALUWorklist(const MachineOperand &Op, MachineInstr &SCCDefInst, SIInstrWorklist &Worklist, Register NewCond = Register()) const; diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 7431e111ec862..8785968569d92 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -296,7 +296,7 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First, for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()), E = MI.getIterator(); I != E; ++I) { - if (I->isBundle()) + if (I->isBundle() || I->isDebugInstr()) continue; switch (I->getOpcode()) { case AMDGPU::S_SET_GPR_IDX_MODE: @@ -640,7 +640,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates( } void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) { - MachineOperand DstOp = I.getOperand(0); + const MachineOperand &DstOp = I.getOperand(0); uint16_t UnpackedOpcode = mapToUnpackedOpcode(I); assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() && diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 4ae2c1ed04dae..31d8bce4d0c87 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1707,7 +1707,7 @@ multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string Pse defvar WMMAConstraints2Addr = !if(DiffVdstSrc2, "@earlyclobber $vdst", "@earlyclobber $vdst,$vdst = $src2"); defvar WMMAConstraints3Addr = "@earlyclobber $vdst"; - let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in { + let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0, isConvergent = 1 in { let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile>, WMMAInstInfo { let PseudoInstr = Instr#PseudoInstrSuffix; @@ -1734,7 +1734,7 @@ multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile, string P let mayRaiseFPException = 0; let ReadsModeReg = 0; let AsmMatchConverter = "cvtSWMMAC"; - + let isConvergent = 1; let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef"; } } @@ -1906,8 +1906,10 @@ defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16 defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">; } // End is_wmma_xdl = 1. 
-defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; -defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +let isConvergent = 1 in { + defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32_Lo256>>; + defm V_WMMA_LD_SCALE16_PAIRED_B64 : VOP3PInst<"v_wmma_ld_scale16_paired_b64", VOP_WMMA_LD_SCALE<i64, VCSrc_b64_Lo256>>; +} } // End SubtargetPredicate = isGFX125xOnly } // End WaveSizePredicate = isWave32 diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp index 14e1160e70dae..88d3b6f7d5bb9 100644 --- a/llvm/lib/Target/ARM/ARMFastISel.cpp +++ b/llvm/lib/Target/ARM/ARMFastISel.cpp @@ -86,7 +86,7 @@ namespace { // All possible address modes, plus some. class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 6b0653457cbaf..92fae71121a81 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1312,8 +1312,8 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM_, setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom); } - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // FP-ARMv8 implements a lot of rounding-like FP operations. if (Subtarget->hasFPARMv8Base()) { @@ -9855,76 +9855,6 @@ static SDValue LowerUADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); } -SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { - // For iOS, we want to call an alternative entry point: __sincos_stret, - // return values are passed via sret. - SDLoc dl(Op); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - RTLIB::Libcall LC = RTLIB::getSINCOS_STRET(ArgVT); - RTLIB::LibcallImpl SincosStret = getLibcallImpl(LC); - if (SincosStret == RTLIB::Unsupported) - return SDValue(); - - assert(Subtarget->isTargetDarwin()); - - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - auto PtrVT = getPointerTy(DAG.getDataLayout()); - - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); - - // Pair of floats / doubles used to pass the result. - Type *RetTy = StructType::get(ArgTy, ArgTy); - auto &DL = DAG.getDataLayout(); - - ArgListTy Args; - bool ShouldUseSRet = getTM().isAPCS_ABI(); - SDValue SRet; - if (ShouldUseSRet) { - // Create stack object for sret. 
- const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); - const Align StackAlign = DL.getPrefTypeAlign(RetTy); - int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); - SRet = DAG.getFrameIndex(FrameIdx, getPointerTy(DL)); - - ArgListEntry Entry(SRet, PointerType::getUnqual(RetTy->getContext())); - Entry.IsSExt = false; - Entry.IsZExt = false; - Entry.IsSRet = true; - Args.push_back(Entry); - RetTy = Type::getVoidTy(*DAG.getContext()); - } - - Args.emplace_back(Arg, ArgTy); - - StringRef LibcallName = getLibcallImplName(SincosStret); - CallingConv::ID CC = getLibcallImplCallingConv(SincosStret); - SDValue Callee = DAG.getExternalSymbol(LibcallName.data(), getPointerTy(DL)); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setCallee(CC, RetTy, Callee, std::move(Args)) - .setDiscardResult(ShouldUseSRet); - std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); - - if (!ShouldUseSRet) - return CallResult.first; - - SDValue LoadSin = - DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); - - // Address of cos field. - SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, - DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); - SDValue LoadCos = - DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); - - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, - LoadSin.getValue(0), LoadCos.getValue(0)); -} - SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed, SDValue &Chain) const { @@ -10726,8 +10656,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VECREDUCE_SMAX: return LowerVecReduceMinMax(Op, DAG, Subtarget); case ISD::ATOMIC_LOAD: - case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); + case ISD::ATOMIC_STORE: + return LowerAtomicLoadStore(Op, DAG); case ISD::SDIVREM: case ISD::UDIVREM: return LowerDivRem(Op, DAG); case ISD::DYNAMIC_STACKALLOC: diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index bf3438b0d8803..bc2fec3c1bdb5 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -901,7 +901,6 @@ class VectorType; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const; void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed, diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 9b250e6cac3ab..24f58a68c345d 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2448,7 +2448,8 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) { // static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, - const LoopAccessInfo *LAI) { + const LoopAccessInfo *LAI, + const DominatorTree &DT) { LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n"); // If there are live-out values, it is probably a reduction. 
We can predicate @@ -2498,7 +2499,8 @@ static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, if (isa<StoreInst>(I) || isa<LoadInst>(I)) { Value *Ptr = getLoadStorePointerOperand(&I); Type *AccessTy = getLoadStoreType(&I); - int64_t NextStride = getPtrStride(PSE, AccessTy, Ptr, L).value_or(0); + int64_t NextStride = + getPtrStride(PSE, AccessTy, Ptr, L, DT).value_or(0); if (NextStride == 1) { // TODO: for now only allow consecutive strides of 1. We could support // other strides as long as it is uniform, but let's keep it simple @@ -2585,7 +2587,8 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) const { return false; } - return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI()); + return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(), + *LVL->getDominatorTree()); } TailFoldingStyle diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp index 77dc4a75a7d68..abe081c0c76fd 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp @@ -88,6 +88,16 @@ bool BPFAsmPrinter::doFinalization(Module &M) { } } + for (GlobalObject &GO : M.global_objects()) { + if (!GO.hasExternalWeakLinkage()) + continue; + + if (!SawTrapCall && GO.getName() == BPF_TRAP) { + GO.eraseFromParent(); + break; + } + } + return AsmPrinter::doFinalization(M); } @@ -160,6 +170,16 @@ bool BPFAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, } void BPFAsmPrinter::emitInstruction(const MachineInstr *MI) { + if (MI->isCall()) { + for (const MachineOperand &Op : MI->operands()) { + if (Op.isGlobal()) { + if (const GlobalValue *GV = Op.getGlobal()) + if (GV->getName() == BPF_TRAP) + SawTrapCall = true; + } + } + } + BPF_MC::verifyInstructionPredicates(MI->getOpcode(), getSubtargetInfo().getFeatureBits()); @@ -195,6 +215,10 @@ void BPFAsmPrinter::emitJumpTableInfo() { const TargetLoweringObjectFile &TLOF = getObjFileLowering(); const Function &F = MF->getFunction(); + + MCSection *Sec = OutStreamer->getCurrentSectionOnly(); + MCSymbol *SecStart = Sec->getBeginSymbol(); + MCSection *JTS = TLOF.getSectionForJumpTable(F, TM); assert(MJTI->getEntryKind() == MachineJumpTableInfo::EK_BlockAddress); unsigned EntrySize = MJTI->getEntrySize(getDataLayout()); @@ -207,8 +231,10 @@ void BPFAsmPrinter::emitJumpTableInfo() { MCSymbol *JTStart = getJTPublicSymbol(JTI); OutStreamer->emitLabel(JTStart); for (const MachineBasicBlock *MBB : JTBBs) { - const MCExpr *LHS = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext); - OutStreamer->emitValue(LHS, EntrySize); + const MCExpr *Diff = MCBinaryExpr::createSub( + MCSymbolRefExpr::create(MBB->getSymbol(), OutContext), + MCSymbolRefExpr::create(SecStart, OutContext), OutContext); + OutStreamer->emitValue(Diff, EntrySize); } const MCExpr *JTSize = MCConstantExpr::create(JTBBs.size() * EntrySize, OutContext); diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.h b/llvm/lib/Target/BPF/BPFAsmPrinter.h index 90ef2073609a6..75a1d7ed9f884 100644 --- a/llvm/lib/Target/BPF/BPFAsmPrinter.h +++ b/llvm/lib/Target/BPF/BPFAsmPrinter.h @@ -39,6 +39,7 @@ class BPFAsmPrinter : public AsmPrinter { private: BTFDebug *BTF; TargetMachine &TM; + bool SawTrapCall = false; const BPFTargetMachine &getBTM() const; }; diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp index 8ace2d2777c74..eb4c8846441a2 100644 --- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp +++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp @@ -194,9 +194,10 @@ void 
DXContainerGlobals::addResourcesForPSV(Module &M, PSVRuntimeInfo &PSV) { dxbc::PSV::v2::ResourceBindInfo BindInfo; BindInfo.Type = Type; BindInfo.LowerBound = Binding.LowerBound; - assert(Binding.Size == UINT32_MAX || - (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX && - "Resource range is too large"); + assert( + (Binding.Size == UINT32_MAX || + (uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX) && + "Resource range is too large"); BindInfo.UpperBound = (Binding.Size == UINT32_MAX) ? UINT32_MAX : Binding.LowerBound + Binding.Size - 1; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index 7ae500a55b92d..67437f6969b27 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -1079,6 +1079,15 @@ def WaveActiveOp : DXILOp<119, waveActiveOp> { let attributes = [Attributes<DXIL1_0, []>]; } +def LegacyF16ToF32 : DXILOp<131, legacyF16ToF32> { + let Doc = "returns the float16 stored in the low-half of the uint converted " + "to a float"; + let intrinsics = [IntrinSelect<int_dx_legacyf16tof32>]; + let arguments = [Int32Ty]; + let result = FloatTy; + let stages = [Stages<DXIL1_0, [all_stages]>]; +} + def WaveAllBitCount : DXILOp<135, waveAllOp> { let Doc = "returns the count of bits set to 1 across the wave"; let intrinsics = [IntrinSelect<int_dx_wave_active_countbits>]; diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index 60dfd9650937c..6cacbf6564db2 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -29,11 +29,12 @@ bool DirectXTTIImpl::isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx) const { switch (ID) { case Intrinsic::dx_asdouble: - case Intrinsic::dx_isinf: - case Intrinsic::dx_isnan: case Intrinsic::dx_firstbitlow: - case Intrinsic::dx_firstbituhigh: case Intrinsic::dx_firstbitshigh: + case Intrinsic::dx_firstbituhigh: + case Intrinsic::dx_isinf: + case Intrinsic::dx_isnan: + case Intrinsic::dx_legacyf16tof32: return OpdIdx == 0; default: return OpdIdx == -1; @@ -50,6 +51,7 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( case Intrinsic::dx_frac: case Intrinsic::dx_isinf: case Intrinsic::dx_isnan: + case Intrinsic::dx_legacyf16tof32: case Intrinsic::dx_rsqrt: case Intrinsic::dx_saturate: case Intrinsic::dx_splitdouble: diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index fe700e17d341b..cf4ffc82f6009 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -6630,6 +6630,11 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(LoongArchISD::VANY_NONZERO, DL, N->getValueType(0), N->getOperand(1)); break; + case Intrinsic::loongarch_lasx_concat_128_s: + case Intrinsic::loongarch_lasx_concat_128_d: + case Intrinsic::loongarch_lasx_concat_128: + return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), + N->getOperand(1), N->getOperand(2)); } return SDValue(); } diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index b502b056c4cdf..00d52870f1727 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -2113,6 +2113,37 @@ defm : subvector_subreg_lowering<LSX128, v2f64, LASX256, v4f64, 2, sub_128>; defm : 
subvector_subreg_lowering<LSX128, v8i16, LASX256, v16i16, 8, sub_128>; defm : subvector_subreg_lowering<LSX128, v16i8, LASX256, v32i8, 16, sub_128>; +// LASX and LSX conversion +def : Pat<(int_loongarch_lasx_cast_128_s (v4f32 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128_d (v2f64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128 (v2i64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo (v4i64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi (v4i64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; } // Predicates = [HasExtLASX] /// Intrinsic pattern diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index df0c8c13fa38d..06210b6b91b93 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -82,7 +82,7 @@ class MipsFastISel final : public FastISel { // All possible address modes. 
class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index c667a09f95dbb..996d653940118 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1836,7 +1836,7 @@ bool NVPTXDAGToDAGISel::tryFence(SDNode *N) { return true; } -NVPTXScopes::NVPTXScopes(LLVMContext &C) { +NVPTXScopes::NVPTXScopes(LLVMContext &C) : Context(&C) { Scopes[C.getOrInsertSyncScopeID("singlethread")] = NVPTX::Scope::Thread; Scopes[C.getOrInsertSyncScopeID("")] = NVPTX::Scope::System; Scopes[C.getOrInsertSyncScopeID("block")] = NVPTX::Scope::Block; @@ -1851,11 +1851,21 @@ NVPTX::Scope NVPTXScopes::operator[](SyncScope::ID ID) const { auto S = Scopes.find(ID); if (S == Scopes.end()) { - // TODO: - // - Add API to LLVMContext to get the name of a single scope. - // - Use that API here to print an error containing the name - // of this Unknown ID. - report_fatal_error(formatv("Could not find scope ID={}.", int(ID))); + auto ScopeName = Context->getSyncScopeName(ID); + assert(ScopeName.has_value() && "Scope name must exist."); + + // Build the list of supported syncscopes programmatically. + SmallVector<StringRef> SupportedScopes; + for (const auto &Entry : Scopes) { + if (auto Name = Context->getSyncScopeName(Entry.first)) + SupportedScopes.push_back(Name->empty() ? "<empty string>" : *Name); + } + + reportFatalUsageError( + formatv("NVPTX backend does not support syncscope \"{0}\" (ID={1}).\n" + "Supported syncscopes are: {2}.", + ScopeName.value(), int(ID), + make_range(SupportedScopes.begin(), SupportedScopes.end()))); } return S->second; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 1cb579bd96730..d525531766ddf 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -35,6 +35,7 @@ struct NVPTXScopes { private: SmallMapVector<SyncScope::ID, NVPTX::Scope, 8> Scopes{}; + LLVMContext *Context = nullptr; }; class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index b26022184708c..f0bdf472b96ed 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2267,7 +2267,7 @@ def : Pat<(f32 (fpround f64:$a)), (CVT_f32_f64 $a, CvtRN)>; def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; def : Pat<(f32 (fpextend f16:$a)), (CVT_f32_f16 $a, CvtNONE)>; // fpextend bf16 -> f32 -def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>; +def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE_FTZ)>, Requires<[doF32FTZ, hasPTX<78>, hasSM<90>]>; def : Pat<(f32 (fpextend bf16:$a)), (CVT_f32_bf16 $a, CvtNONE)>, Requires<[hasPTX<71>, hasSM<80>]>; // fpextend f16 -> f64 diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td index da3efdc15f1e1..0c2e44e18f463 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td @@ -360,6 +360,10 @@ let Predicates = [HasVSX, IsISAFuture] in { def LXVPRLL : XForm_XTp5_RAB5<31, 621, (outs vsrprc:$XTp), (ins (memr $RA):$addr, g8rc:$RB), "lxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; + def LXVPB32X + : XForm_XTp5_RAB5<31, 877, (outs
vsrprc:$XTp), + (ins (memr $RA):$addr, g8rc:$RB), + "lxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } let mayStore = 1 in { @@ -376,6 +380,10 @@ let Predicates = [HasVSX, IsISAFuture] in { : XForm_XTp5_RAB5<31, 749, (outs), (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), "stxvprll $XTp, $addr, $RB", IIC_LdStLFD, []>; + def STXVPB32X + : XForm_XTp5_RAB5<31, 1005, (outs), + (ins vsrprc:$XTp, (memr $RA):$addr, g8rc:$RB), + "stxvpb32x $XTp, $addr, $RB", IIC_LdStLFD, []>; } def VUPKHSNTOB : VXForm_VRTB5<387, 0, (outs vrrc:$VRT), (ins vrrc:$VRB), diff --git a/llvm/lib/Target/PowerPC/PPCInstrMMA.td b/llvm/lib/Target/PowerPC/PPCInstrMMA.td index b38dd4ae948c6..fc3cde3f464bb 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrMMA.td +++ b/llvm/lib/Target/PowerPC/PPCInstrMMA.td @@ -202,7 +202,7 @@ multiclass ACC_UM_M244_XO46<bits<6> opcode, bits<8> xo, dag IOL, string asmbase, RegConstraint<"@earlyclobber $AT">; def PM#NAME#WPP : MMIRR_XX3Form_XY4P2_XAB6< - opcode, !or(xo, 0x20), (outs acc:$AT), + opcode, !or(xo, 0x20), (outs wacc:$AT), !con((ins wacc:$ATi), !con(IOL, (ins u4imm:$XMSK, u4imm:$YMSK, u2imm:$PMSK))), !strconcat("pm"#asmbase#"pp ", asmstr#", $XMSK, $YMSK, $PMSK"), @@ -765,7 +765,7 @@ let Predicates = [MMA, IsISAFuture] in { def : Pat<(v512i1 (int_ppc_mma_xvf64gerpn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), (XVF64GERWPN $ATi, $XA, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf64gernp v512i1:$ATi, v256i1:$XA, v16i8:$XB)), - (XVF64GERNP $ATi, $XA, RCCp.BToVSRC)>; + (XVF64GERWNP $ATi, $XA, RCCp.BToVSRC)>; def : Pat<(v512i1 (int_ppc_mma_xvf64gernn v512i1:$ATi, v256i1:$XA, v16i8:$XB)), (XVF64GERWNN $ATi, $XA, RCCp.BToVSRC)>; diff --git a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp index 282cf5d681685..3d5a55c631301 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVInstructionSelector.cpp @@ -95,7 +95,8 @@ class RISCVInstructionSelector : public InstructionSelector { void addVectorLoadStoreOperands(MachineInstr &I, SmallVectorImpl<SrcOp> &SrcOps, unsigned &CurOp, bool IsMasked, - bool IsStrided) const; + bool IsStridedOrIndexed, + LLT *IndexVT = nullptr) const; bool selectIntrinsicWithSideEffects(MachineInstr &I, MachineIRBuilder &MIB) const; @@ -722,15 +723,17 @@ static unsigned selectRegImmLoadStoreOp(unsigned GenericOpc, unsigned OpSize) { void RISCVInstructionSelector::addVectorLoadStoreOperands( MachineInstr &I, SmallVectorImpl<SrcOp> &SrcOps, unsigned &CurOp, - bool IsMasked, bool IsStrided) const { + bool IsMasked, bool IsStridedOrIndexed, LLT *IndexVT) const { // Base Pointer auto PtrReg = I.getOperand(CurOp++).getReg(); SrcOps.push_back(PtrReg); - // Stride - if (IsStrided) { + // Stride or Index + if (IsStridedOrIndexed) { auto StrideReg = I.getOperand(CurOp++).getReg(); SrcOps.push_back(StrideReg); + if (IndexVT) + *IndexVT = MRI->getType(StrideReg); } // Mask @@ -805,6 +808,70 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( I.eraseFromParent(); return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); } + case Intrinsic::riscv_vloxei: + case Intrinsic::riscv_vloxei_mask: + case Intrinsic::riscv_vluxei: + case Intrinsic::riscv_vluxei_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vloxei_mask || + IntrinID == Intrinsic::riscv_vluxei_mask; + bool IsOrdered = IntrinID == Intrinsic::riscv_vloxei || + IntrinID == Intrinsic::riscv_vloxei_mask; + LLT VT = MRI->getType(I.getOperand(0).getReg()); + unsigned Log2SEW = 
Log2_32(VT.getScalarSizeInBits()); + + // Result vector + const Register DstReg = I.getOperand(0).getReg(); + + // Sources + bool HasPassthruOperand = IntrinID != Intrinsic::riscv_vlm; + unsigned CurOp = 2; + SmallVector<SrcOp, 4> SrcOps; // Source registers. + + // Passthru + if (HasPassthruOperand) { + auto PassthruReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(PassthruReg); + } else { + // Use NoRegister if there is no specified passthru. + SrcOps.push_back(Register()); + } + LLT IndexVT; + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, true, &IndexVT); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + RISCVVType::VLMUL IndexLMUL = + RISCVTargetLowering::getLMUL(getMVTForLLT(IndexVT)); + unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } + const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo( + IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), + static_cast<unsigned>(IndexLMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {DstReg}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Policy + uint64_t Policy = RISCVVType::MASK_AGNOSTIC; + if (IsMasked) + Policy = I.getOperand(CurOp++).getImm(); + PseudoMI.addImm(Policy); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } case Intrinsic::riscv_vsm: case Intrinsic::riscv_vse: case Intrinsic::riscv_vse_mask: @@ -847,6 +914,56 @@ bool RISCVInstructionSelector::selectIntrinsicWithSideEffects( I.eraseFromParent(); return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); } + case Intrinsic::riscv_vsoxei: + case Intrinsic::riscv_vsoxei_mask: + case Intrinsic::riscv_vsuxei: + case Intrinsic::riscv_vsuxei_mask: { + bool IsMasked = IntrinID == Intrinsic::riscv_vsoxei_mask || + IntrinID == Intrinsic::riscv_vsuxei_mask; + bool IsOrdered = IntrinID == Intrinsic::riscv_vsoxei || + IntrinID == Intrinsic::riscv_vsoxei_mask; + LLT VT = MRI->getType(I.getOperand(1).getReg()); + unsigned Log2SEW = Log2_32(VT.getScalarSizeInBits()); + + // Sources + unsigned CurOp = 1; + SmallVector<SrcOp, 4> SrcOps; // Source registers. 
+ + // Store value + auto StoreValReg = I.getOperand(CurOp++).getReg(); + SrcOps.push_back(StoreValReg); + + LLT IndexVT; + addVectorLoadStoreOperands(I, SrcOps, CurOp, IsMasked, true, &IndexVT); + + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(VT)); + RISCVVType::VLMUL IndexLMUL = + RISCVTargetLowering::getLMUL(getMVTForLLT(IndexVT)); + unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); + if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); + } + const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo( + IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), + static_cast<unsigned>(IndexLMUL)); + + auto PseudoMI = MIB.buildInstr(P->Pseudo, {}, SrcOps); + + // Select VL + auto VLOpFn = renderVLOp(I.getOperand(CurOp++)); + for (auto &RenderFn : *VLOpFn) + RenderFn(PseudoMI); + + // SEW + PseudoMI.addImm(Log2SEW); + + // Memref + PseudoMI.cloneMemRefs(I); + + I.eraseFromParent(); + return constrainSelectedInstRegOperands(*PseudoMI, TII, TRI, RBI); + } } } diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index e75dfe33814c6..5b8cfb2100b26 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -407,7 +407,6 @@ enum OperandType : unsigned { OPERAND_SIMM5_PLUS1, OPERAND_SIMM6, OPERAND_SIMM6_NONZERO, - OPERAND_SIMM8, OPERAND_SIMM8_UNSIGNED, OPERAND_SIMM10, OPERAND_SIMM10_LSB0000_NONZERO, diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index 526675a682d86..b0453fc57c053 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -131,6 +131,7 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, case RISCV::PseudoCCMAXU: case RISCV::PseudoCCMIN: case RISCV::PseudoCCMINU: + case RISCV::PseudoCCMUL: case RISCV::PseudoCCADDW: case RISCV::PseudoCCSUBW: case RISCV::PseudoCCSLL: @@ -237,6 +238,7 @@ bool RISCVExpandPseudo::expandCCOp(MachineBasicBlock &MBB, case RISCV::PseudoCCMIN: NewOpc = RISCV::MIN; break; case RISCV::PseudoCCMAXU: NewOpc = RISCV::MAXU; break; case RISCV::PseudoCCMINU: NewOpc = RISCV::MINU; break; + case RISCV::PseudoCCMUL: NewOpc = RISCV::MUL; break; case RISCV::PseudoCCADDI: NewOpc = RISCV::ADDI; break; case RISCV::PseudoCCSLLI: NewOpc = RISCV::SLLI; break; case RISCV::PseudoCCSRLI: NewOpc = RISCV::SRLI; break; diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index cfee6ab22d4ff..5b72334f58d45 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1856,6 +1856,11 @@ def TuneShortForwardBranchIMinMax "true", "Enable short forward branch optimization for min,max instructions in Zbb", [TuneShortForwardBranchOpt]>; +def TuneShortForwardBranchIMul + : SubtargetFeature<"short-forward-branch-i-mul", "HasShortForwardBranchIMul", + "true", "Enable short forward branch optimization for the mul instruction", + [TuneShortForwardBranchOpt]>; + // Some subtargets require a S2V transfer buffer to move scalars into vectors. // FIXME: Forming .vx/.vf/.wx/.wf can reduce register pressure.
def TuneNoSinkSplatOperands diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index b25a05400fe31..907833513c5d1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -371,8 +371,8 @@ void RISCVDAGToDAGISel::selectVLXSEG(SDNode *Node, unsigned NF, bool IsMasked, RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); } const RISCV::VLXSEGPseudo *P = RISCV::getVLXSEGPseudo( NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -444,8 +444,8 @@ void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, unsigned NF, bool IsMasked, RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for index " + "values when XLEN=32"); } const RISCV::VSXSEGPseudo *P = RISCV::getVSXSEGPseudo( NF, IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -2223,8 +2223,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for " + "index values when XLEN=32"); } const RISCV::VLX_VSXPseudo *P = RISCV::getVLXPseudo( IsMasked, IsOrdered, IndexLog2EEW, static_cast<unsigned>(LMUL), @@ -2457,8 +2457,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { - report_fatal_error("The V extension does not support EEW=64 for index " - "values when XLEN=32"); + reportFatalUsageError("The V extension does not support EEW=64 for " + "index values when XLEN=32"); } const RISCV::VLX_VSXPseudo *P = RISCV::getVSXPseudo( IsMasked, IsOrdered, IndexLog2EEW, diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index e0cf739f67d9b..c3f100e3197b1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -9186,7 +9186,7 @@ static SDValue lowerSelectToBinOp(SDNode *N, SelectionDAG &DAG, unsigned ShAmount = Log2_64(TrueM1); if (Subtarget.hasShlAdd(ShAmount)) return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, CondV, - DAG.getConstant(ShAmount, DL, VT), CondV); + DAG.getTargetConstant(ShAmount, DL, VT), CondV); } } // (select c, y, 0) -> -c & y @@ -15463,7 +15463,7 @@ static SDValue transformAddShlImm(SDNode *N, SelectionDAG &DAG, SDValue NS = (C0 < C1) ? N0->getOperand(0) : N1->getOperand(0); SDValue NL = (C0 > C1) ? 
N0->getOperand(0) : N1->getOperand(0); SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, NL, - DAG.getConstant(Diff, DL, VT), NS); + DAG.getTargetConstant(Diff, DL, VT), NS); return DAG.getNode(ISD::SHL, DL, VT, SHADD, DAG.getConstant(Bits, DL, VT)); } @@ -15501,7 +15501,7 @@ static SDValue combineShlAddIAddImpl(SDNode *N, SDValue AddI, SDValue Other, int64_t AddConst = AddVal.getSExtValue(); SDValue SHADD = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, SHLVal->getOperand(0), - DAG.getConstant(ShlConst, DL, VT), Other); + DAG.getTargetConstant(ShlConst, DL, VT), Other); return DAG.getNode(ISD::ADD, DL, VT, SHADD, DAG.getSignedConstant(AddConst, DL, VT)); } @@ -16495,6 +16495,35 @@ static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG, return DAG.getNode(Op, DL, VT, Shift1, Shift2); } +static SDValue getShlAddShlAdd(SDNode *N, SelectionDAG &DAG, unsigned ShX, + unsigned ShY) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue X = N->getOperand(0); + SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ShY, DL, VT), X); + return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, + DAG.getTargetConstant(ShX, DL, VT), Mul359); +} + +static SDValue expandMulToShlAddShlAdd(SDNode *N, SelectionDAG &DAG, + uint64_t MulAmt) { + switch (MulAmt) { + case 5 * 3: + return getShlAddShlAdd(N, DAG, 2, 1); + case 9 * 3: + return getShlAddShlAdd(N, DAG, 3, 1); + case 5 * 5: + return getShlAddShlAdd(N, DAG, 2, 2); + case 9 * 5: + return getShlAddShlAdd(N, DAG, 3, 2); + case 9 * 9: + return getShlAddShlAdd(N, DAG, 3, 3); + default: + return SDValue(); + } +} + // Try to expand a scalar multiply to a faster sequence. static SDValue expandMul(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -16524,18 +16553,17 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, if (Subtarget.hasVendorXqciac() && isInt<12>(CNode->getSExtValue())) return SDValue(); - // WARNING: The code below is knowingly incorrect with regards to undef semantics. - // We're adding additional uses of X here, and in principle, we should be freezing - // X before doing so. However, adding freeze here causes real regressions, and no - // other target properly freezes X in these cases either. - SDValue X = N->getOperand(0); - + // WARNING: The code below is knowingly incorrect with regards to undef + // semantics. We're adding additional uses of X here, and in principle, we + // should be freezing X before doing so. However, adding freeze here causes + // real regressions, and no other target properly freezes X in these cases + // either. if (Subtarget.hasShlAdd(3)) { + SDValue X = N->getOperand(0); int Shift; if (int ShXAmount = isShifted359(MulAmt, Shift)) { // 3/5/9 * 2^N -> shl (shXadd X, X), N SDLoc DL(N); - SDValue X = N->getOperand(0); // Put the shift first if we can fold a zext into the shift forming // a slli.uw. if (X.getOpcode() == ISD::AND && isa<ConstantSDNode>(X.getOperand(1)) && @@ -16543,49 +16571,19 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Shift, DL, VT)); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Shl, - DAG.getConstant(ShXAmount, DL, VT), Shl); + DAG.getTargetConstant(ShXAmount, DL, VT), Shl); } // Otherwise, put the shl second so that it can fold with following // instructions (e.g. sext or add). 
SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShXAmount, DL, VT), X); + DAG.getTargetConstant(ShXAmount, DL, VT), X); return DAG.getNode(ISD::SHL, DL, VT, Mul359, DAG.getConstant(Shift, DL, VT)); } // 3/5/9 * 3/5/9 -> shXadd (shYadd X, X), (shYadd X, X) - int ShX; - int ShY; - switch (MulAmt) { - case 3 * 5: - ShY = 1; - ShX = 2; - break; - case 3 * 9: - ShY = 1; - ShX = 3; - break; - case 5 * 5: - ShX = ShY = 2; - break; - case 5 * 9: - ShY = 2; - ShX = 3; - break; - case 9 * 9: - ShX = ShY = 3; - break; - default: - ShX = ShY = 0; - break; - } - if (ShX) { - SDLoc DL(N); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShY, DL, VT), X); - return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(ShX, DL, VT), Mul359); - } + if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt)) + return V; // If this is a power 2 + 2/4/8, we can use a shift followed by a single // shXadd. First check if this a sum of two power of 2s because that's @@ -16598,7 +16596,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), Shift1); + DAG.getTargetConstant(ScaleShift, DL, VT), Shift1); } } @@ -16611,10 +16609,11 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, assert(Shift != 0 && "MulAmt=4,6,10 handled before"); if (Shift <= 3) { SDLoc DL(N); - SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ShXAmount, DL, VT), X); + SDValue Mul359 = + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ShXAmount, DL, VT), X); return DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359, - DAG.getConstant(Shift, DL, VT), X); + DAG.getTargetConstant(Shift, DL, VT), X); } } @@ -16626,9 +16625,10 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, SDLoc DL(N); SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShiftAmt, DL, VT)); - return DAG.getNode(ISD::ADD, DL, VT, Shift1, - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(ScaleShift, DL, VT), X)); + return DAG.getNode( + ISD::ADD, DL, VT, Shift1, + DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, + DAG.getTargetConstant(ScaleShift, DL, VT), X)); } } @@ -16643,28 +16643,17 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG, DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(ShAmt, DL, VT)); SDValue Mul359 = DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Offset - 1), DL, VT), X); + DAG.getTargetConstant(Log2_64(Offset - 1), DL, VT), X); return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359); } } - for (uint64_t Divisor : {3, 5, 9}) { - if (MulAmt % Divisor != 0) - continue; - uint64_t MulAmt2 = MulAmt / Divisor; - // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples - // of 25 which happen to be quite common. - if (int ShBAmount = isShifted359(MulAmt2, Shift)) { - SDLoc DL(N); - SDValue Mul359A = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, X, - DAG.getConstant(Log2_64(Divisor - 1), DL, VT), X); - SDValue Mul359B = - DAG.getNode(RISCVISD::SHL_ADD, DL, VT, Mul359A, - DAG.getConstant(ShBAmount, DL, VT), Mul359A); - return DAG.getNode(ISD::SHL, DL, VT, Mul359B, - DAG.getConstant(Shift, DL, VT)); - } + // 3/5/9 * 3/5/9 * 2^N - In particular, this covers multiples + // of 25 which happen to be quite common. 
+ Shift = llvm::countr_zero(MulAmt); + if (SDValue V = expandMulToShlAddShlAdd(N, DAG, MulAmt >> Shift)) { + SDLoc DL(N); + return DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Shift, DL, VT)); } } @@ -25320,3 +25309,12 @@ ArrayRef<MCPhysReg> RISCVTargetLowering::getRoundingControlRegisters() const { } return {}; } + +bool RISCVTargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { + EVT VT = Y.getValueType(); + + if (VT.isVector()) + return false; + + return VT.getSizeInBits() <= Subtarget.getXLen(); +} diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 9e3e2a9443625..dd62a9cf6c9e2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -465,6 +465,8 @@ class RISCVTargetLowering : public TargetLowering { ArrayRef<MCPhysReg> getRoundingControlRegisters() const override; + bool shouldFoldMaskToVariableShiftPair(SDValue Y) const override; + /// Match a mask which "spreads" the leading elements of a vector evenly /// across the result. Factor is the spread amount, and Index is the /// offset applied. diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index c9df787e0012d..b8ab70bd9e386 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -1703,6 +1703,7 @@ unsigned getPredicatedOpcode(unsigned Opcode) { case RISCV::MAXU: return RISCV::PseudoCCMAXU; case RISCV::MIN: return RISCV::PseudoCCMIN; case RISCV::MINU: return RISCV::PseudoCCMINU; + case RISCV::MUL: return RISCV::PseudoCCMUL; case RISCV::ADDI: return RISCV::PseudoCCADDI; case RISCV::SLLI: return RISCV::PseudoCCSLLI; @@ -1754,6 +1755,9 @@ static MachineInstr *canFoldAsPredicatedOp(Register Reg, MI->getOpcode() == RISCV::MINU || MI->getOpcode() == RISCV::MAXU)) return nullptr; + if (!STI.hasShortForwardBranchIMul() && MI->getOpcode() == RISCV::MUL) + return nullptr; + // Check if MI can be predicated and folded into the CCMOV. 
if (getPredicatedOpcode(MI->getOpcode()) == RISCV::INSTRUCTION_LIST_END) return nullptr; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td index 5a67a5aaba293..494b1c9f98839 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoSFB.td @@ -110,6 +110,7 @@ def PseudoCCMAX : SFBALU_rr; def PseudoCCMIN : SFBALU_rr; def PseudoCCMAXU : SFBALU_rr; def PseudoCCMINU : SFBALU_rr; +def PseudoCCMUL : SFBALU_rr; def PseudoCCADDI : SFBALU_ri; def PseudoCCANDI : SFBALU_ri; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td index b37ceaaee9cf4..c2b25c6294019 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td @@ -60,6 +60,8 @@ def immfour : RISCVOp { let DecoderMethod = "decodeImmFourOperand"; } +def tuimm2 : TImmLeaf<XLenVT, [{return isUInt<2>(Imm);}]>; + //===----------------------------------------------------------------------===// // Instruction class templates //===----------------------------------------------------------------------===// @@ -557,8 +559,8 @@ multiclass VPatTernaryVMAQA_VV_VX<string intrinsic, string instruction, let Predicates = [HasVendorXTHeadBa] in { def : Pat<(add_like_non_imm12 (shl GPR:$rs2, uimm2:$uimm2), (XLenVT GPR:$rs1)), (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>; -def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, uimm2:$uimm2, GPR:$rs1)), - (TH_ADDSL GPR:$rs1, GPR:$rs2, uimm2:$uimm2)>; +def : Pat<(XLenVT (riscv_shl_add GPR:$rs2, tuimm2:$uimm2, GPR:$rs1)), + (TH_ADDSL GPR:$rs1, GPR:$rs2, tuimm2:$uimm2)>; // Reuse complex patterns from StdExtZba def : Pat<(add_like_non_imm12 sh1add_op:$rs2, (XLenVT GPR:$rs1)), diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 4537bfe8025ca..8376da52be53e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -53,6 +53,8 @@ def uimm5gt3 : RISCVOp<XLenVT>, ImmLeaf<XLenVT, let OperandType = "OPERAND_UIMM5_GT3"; } +def tuimm5gt3 : TImmLeaf<XLenVT, [{return (Imm > 3) && isUInt<5>(Imm);}]>; + def UImm5Plus1AsmOperand : AsmOperandClass { let Name = "UImm5Plus1"; let RenderMethod = "addImmOperands"; @@ -1419,8 +1421,8 @@ def : Pat<(i32 (add GPRNoX0:$rd, (mul GPRNoX0:$rs1, simm12_lo:$imm12))), (QC_MULIADD GPRNoX0:$rd, GPRNoX0:$rs1, simm12_lo:$imm12)>; def : Pat<(i32 (add_like_non_imm12 (shl GPRNoX0:$rs1, (i32 uimm5gt3:$imm)), GPRNoX0:$rs2)), (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; -def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 uimm5gt3:$imm), GPRNoX0:$rs2)), - (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, uimm5gt3:$imm)>; +def : Pat<(i32 (riscv_shl_add GPRNoX0:$rs1, (i32 tuimm5gt3:$imm), GPRNoX0:$rs2)), + (QC_SHLADD GPRNoX0:$rs1, GPRNoX0:$rs2, tuimm5gt3:$imm)>; } // Predicates = [HasVendorXqciac, IsRV32] /// Simple arithmetic operations diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index c31713e967b18..1c6a5afcda49b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -90,6 +90,7 @@ defvar ZfhminDExts = [ZfhminDExt, ZhinxminZdinxExt, ZhinxminZdinx32Ext]; //===----------------------------------------------------------------------===// let Predicates = [HasHalfFPLoadStoreMove] in { +let canFoldAsLoad = 1 in def FLH : FPLoad_r<0b001, "flh", FPR16, WriteFLD16>; // Operands for stores are in the order srcreg, base, 
offset rather than diff --git a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp index 640b014646f36..0175f2fb3698b 100644 --- a/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVAsmPrinter.cpp @@ -577,6 +577,11 @@ void SPIRVAsmPrinter::outputExecutionMode(const Module &M) { if (MDNode *Node = F.getMetadata("intel_reqd_sub_group_size")) outputExecutionModeFromMDNode(FReg, Node, SPIRV::ExecutionMode::SubgroupSize, 0, 0); + if (MDNode *Node = F.getMetadata("max_work_group_size")) { + if (ST->canUseExtension(SPIRV::Extension::SPV_INTEL_kernel_attributes)) + outputExecutionModeFromMDNode( + FReg, Node, SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, 3, 1); + } if (MDNode *Node = F.getMetadata("vec_type_hint")) { MCInst Inst; Inst.setOpcode(SPIRV::OpExecutionMode); diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp index 9e11c3a281a1b..dd57b74d79a5e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp @@ -149,23 +149,23 @@ static FunctionType *getOriginalFunctionType(const Function &F) { return isa<MDString>(N->getOperand(0)) && cast<MDString>(N->getOperand(0))->getString() == F.getName(); }); - // TODO: probably one function can have numerous type mutations, - // so we should support this. if (ThisFuncMDIt != NamedMD->op_end()) { auto *ThisFuncMD = *ThisFuncMDIt; - MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(1)); - assert(MD && "MDNode operand is expected"); - ConstantInt *Const = getConstInt(MD, 0); - if (Const) { - auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1)); - assert(CMeta && "ConstantAsMetadata operand is expected"); - assert(Const->getSExtValue() >= -1); - // Currently -1 indicates return value, greater values mean - // argument numbers. - if (Const->getSExtValue() == -1) - RetTy = CMeta->getType(); - else - ArgTypes[Const->getSExtValue()] = CMeta->getType(); + for (unsigned I = 1; I != ThisFuncMD->getNumOperands(); ++I) { + MDNode *MD = dyn_cast<MDNode>(ThisFuncMD->getOperand(I)); + assert(MD && "MDNode operand is expected"); + ConstantInt *Const = getConstInt(MD, 0); + if (Const) { + auto *CMeta = dyn_cast<ConstantAsMetadata>(MD->getOperand(1)); + assert(CMeta && "ConstantAsMetadata operand is expected"); + assert(Const->getSExtValue() >= -1); + // Currently -1 indicates return value, greater values mean + // argument numbers. 
+ if (Const->getSExtValue() == -1) + RetTy = CMeta->getType(); + else + ArgTypes[Const->getSExtValue()] = CMeta->getType(); + } } } diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index 96f5dee21bc2a..43b2869cecdf7 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -107,6 +107,8 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> SPIRV::Extension::Extension::SPV_INTEL_inline_assembly}, {"SPV_INTEL_bindless_images", SPIRV::Extension::Extension::SPV_INTEL_bindless_images}, + {"SPV_INTEL_bfloat16_arithmetic", + SPIRV::Extension::Extension::SPV_INTEL_bfloat16_arithmetic}, {"SPV_INTEL_bfloat16_conversion", SPIRV::Extension::Extension::SPV_INTEL_bfloat16_conversion}, {"SPV_KHR_subgroup_rotate", @@ -155,7 +157,9 @@ static const std::map<std::string, SPIRV::Extension::Extension, std::less<>> {"SPV_INTEL_predicated_io", SPIRV::Extension::Extension::SPV_INTEL_predicated_io}, {"SPV_KHR_maximal_reconvergence", - SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}}; + SPIRV::Extension::Extension::SPV_KHR_maximal_reconvergence}, + {"SPV_INTEL_kernel_attributes", + SPIRV::Extension::Extension::SPV_INTEL_kernel_attributes}}; bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName, StringRef ArgValue, diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index 3f0424f436c72..245e5a2894604 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -3516,6 +3516,10 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, case Intrinsic::spv_resource_nonuniformindex: { return selectResourceNonUniformIndex(ResVReg, ResType, I); } + case Intrinsic::spv_unpackhalf2x16: { + return selectExtInst(ResVReg, ResType, I, GL::UnpackHalf2x16); + } + default: { std::string DiagMsg; raw_string_ostream OS(DiagMsg); diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index db036a55ee6c6..e5ac76c405841 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1435,6 +1435,8 @@ void addInstrRequirements(const MachineInstr &MI, addPrintfRequirements(MI, Reqs, ST); break; } + // TODO: handle bfloat16 extended instructions when + // SPV_INTEL_bfloat16_arithmetic is enabled. 
break; } case SPIRV::OpAliasDomainDeclINTEL: @@ -2060,7 +2062,64 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::PredicatedIOINTEL); break; } - + case SPIRV::OpFAddS: + case SPIRV::OpFSubS: + case SPIRV::OpFMulS: + case SPIRV::OpFDivS: + case SPIRV::OpFRemS: + case SPIRV::OpFMod: + case SPIRV::OpFNegate: + case SPIRV::OpFAddV: + case SPIRV::OpFSubV: + case SPIRV::OpFMulV: + case SPIRV::OpFDivV: + case SPIRV::OpFRemV: + case SPIRV::OpFNegateV: { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + SPIRVType *TypeDef = MRI.getVRegDef(MI.getOperand(1).getReg()); + if (TypeDef->getOpcode() == SPIRV::OpTypeVector) + TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg()); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic)) + report_fatal_error( + "Arithmetic instructions with bfloat16 arguments require the " + "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic); + Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL); + } + break; + } + case SPIRV::OpOrdered: + case SPIRV::OpUnordered: + case SPIRV::OpFOrdEqual: + case SPIRV::OpFOrdNotEqual: + case SPIRV::OpFOrdLessThan: + case SPIRV::OpFOrdLessThanEqual: + case SPIRV::OpFOrdGreaterThan: + case SPIRV::OpFOrdGreaterThanEqual: + case SPIRV::OpFUnordEqual: + case SPIRV::OpFUnordNotEqual: + case SPIRV::OpFUnordLessThan: + case SPIRV::OpFUnordLessThanEqual: + case SPIRV::OpFUnordGreaterThan: + case SPIRV::OpFUnordGreaterThanEqual: { + const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + MachineInstr *OperandDef = MRI.getVRegDef(MI.getOperand(2).getReg()); + SPIRVType *TypeDef = MRI.getVRegDef(OperandDef->getOperand(1).getReg()); + if (TypeDef->getOpcode() == SPIRV::OpTypeVector) + TypeDef = MRI.getVRegDef(TypeDef->getOperand(1).getReg()); + if (isBFloat16Type(TypeDef)) { + if (!ST.canUseExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic)) + report_fatal_error( + "Relational instructions with bfloat16 arguments require the " + "following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic", + false); + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_bfloat16_arithmetic); + Reqs.addCapability(SPIRV::Capability::BFloat16ArithmeticINTEL); + } + break; + } default: break; } @@ -2180,6 +2239,10 @@ static void collectReqs(const Module &M, SPIRV::ModuleAnalysisInfo &MAI, MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, SPIRV::ExecutionMode::SubgroupSize, ST); + if (F.getMetadata("max_work_group_size")) + MAI.Reqs.getAndAddRequirements( + SPIRV::OperandCategory::ExecutionModeOperand, + SPIRV::ExecutionMode::MaxWorkgroupSizeINTEL, ST); if (F.getMetadata("vec_type_hint")) MAI.Reqs.getAndAddRequirements( SPIRV::OperandCategory::ExecutionModeOperand, diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp index ba09692fec515..ad6c9cd421b7c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp @@ -70,7 +70,6 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, SPIRVVersion = VersionTuple(1, 3); break; case Triple::SPIRVSubArch_v14: - default: SPIRVVersion = VersionTuple(1, 4); break; case Triple::SPIRVSubArch_v15: @@ -79,13 +78,19 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, case Triple::SPIRVSubArch_v16: SPIRVVersion = VersionTuple(1, 6); break; + default: + if (TT.getVendor() == 
Triple::AMD) + SPIRVVersion = VersionTuple(1, 6); + else + SPIRVVersion = VersionTuple(1, 4); } OpenCLVersion = VersionTuple(2, 2); // Set the environment based on the target triple. if (TargetTriple.getOS() == Triple::Vulkan) Env = Shader; - else if (TargetTriple.getEnvironment() == Triple::OpenCL) + else if (TargetTriple.getEnvironment() == Triple::OpenCL || + TargetTriple.getVendor() == Triple::AMD) Env = Kernel; else Env = Unknown; @@ -93,6 +98,8 @@ SPIRVSubtarget::SPIRVSubtarget(const Triple &TT, const std::string &CPU, // Set the default extensions based on the target triple. if (TargetTriple.getVendor() == Triple::Intel) Extensions.insert(SPIRV::Extension::SPV_INTEL_function_pointers); + if (TargetTriple.getVendor() == Triple::AMD) + Extensions = SPIRVExtensionsParser::getValidExtensions(TargetTriple); // The order of initialization is important. initAvailableExtensions(Extensions); diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 7d08b29a51a6e..1b4b29bbb160a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -387,6 +387,8 @@ defm SPV_INTEL_tensor_float32_conversion : ExtensionOperand<125, [EnvOpenCL]>; defm SPV_KHR_bfloat16 : ExtensionOperand<126, [EnvVulkan, EnvOpenCL]>; defm SPV_INTEL_predicated_io : ExtensionOperand<127, [EnvOpenCL]>; defm SPV_KHR_maximal_reconvergence : ExtensionOperand<128, [EnvVulkan]>; +defm SPV_INTEL_bfloat16_arithmetic + : ExtensionOperand<129, [EnvVulkan, EnvOpenCL]>; //===----------------------------------------------------------------------===// // Multiclass used to define Capabilities enum values and at the same time @@ -570,6 +572,7 @@ defm AtomicFloat64MinMaxEXT : CapabilityOperand<5613, 0, 0, [SPV_EXT_shader_atom defm VariableLengthArrayINTEL : CapabilityOperand<5817, 0, 0, [SPV_INTEL_variable_length_array], []>; defm GroupUniformArithmeticKHR : CapabilityOperand<6400, 0, 0, [SPV_KHR_uniform_group_instructions], []>; defm USMStorageClassesINTEL : CapabilityOperand<5935, 0, 0, [SPV_INTEL_usm_storage_classes], [Kernel]>; +defm BFloat16ArithmeticINTEL : CapabilityOperand<6226, 0, 0, [SPV_INTEL_bfloat16_arithmetic], []>; defm BFloat16ConversionINTEL : CapabilityOperand<6115, 0, 0, [SPV_INTEL_bfloat16_conversion], []>; defm GlobalVariableHostAccessINTEL : CapabilityOperand<6187, 0, 0, [SPV_INTEL_global_variable_host_access], []>; defm HostAccessINTEL : CapabilityOperand<6188, 0, 0, [SPV_INTEL_global_variable_host_access], []>; @@ -587,6 +590,11 @@ defm CooperativeMatrixBFloat16ComponentTypeINTEL : CapabilityOperand<6437, 0, 0, defm RoundToInfinityINTEL : CapabilityOperand<5582, 0, 0, [SPV_INTEL_float_controls2], []>; defm FloatingPointModeINTEL : CapabilityOperand<5583, 0, 0, [SPV_INTEL_float_controls2], []>; defm FunctionFloatControlINTEL : CapabilityOperand<5821, 0, 0, [SPV_INTEL_float_controls2], []>; +defm KernelAttributesINTEL : CapabilityOperand<5892, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// TODO-SPIRV: add these once they are used / tested. 
+// defm FPGAKernelAttributesINTEL : CapabilityOperand<5897, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// defm FPGAKernelAttributesv2INTEL : CapabilityOperand<6161, 0, 0, [SPV_INTEL_kernel_attributes], [Kernel]>; +// END TODO-SPIRV defm LongCompositesINTEL : CapabilityOperand<6089, 0, 0, [SPV_INTEL_long_composites], []>; defm BindlessImagesINTEL : CapabilityOperand<6528, 0, 0, [SPV_INTEL_bindless_images], []>; defm MemoryAccessAliasingINTEL : CapabilityOperand<5910, 0, 0, [SPV_INTEL_memory_access_aliasing], []>; @@ -805,6 +813,15 @@ defm RoundingModeRTPINTEL : ExecutionModeOperand<5620, [RoundToInfinityINTEL]>; defm RoundingModeRTNINTEL : ExecutionModeOperand<5621, [RoundToInfinityINTEL]>; defm FloatingPointModeALTINTEL : ExecutionModeOperand<5622, [FloatingPointModeINTEL]>; defm FloatingPointModeIEEEINTEL : ExecutionModeOperand<5623, [FloatingPointModeINTEL]>; +defm MaxWorkgroupSizeINTEL : ExecutionModeOperand<5893, [KernelAttributesINTEL]>; +// TODO-SPIRV: Add the following once they are used / tested. +// defm MaxWorkDimINTEL : ExecutionModeOperand<5894, [KernelAttributesINTEL]>; +// defm NoGlobalOffsetINTEL : ExecutionModeOperand<5895, [KernelAttributesINTEL]>; +// defm NumSIMDWorkitemsINTEL : ExecutionModeOperand<5896, [FPGAKernelAttributesINTEL]>; +// defm SchedulerTargetFmaxMhzINTEL : ExecutionModeOperand<5903, [FPGAKernelAttributesINTEL]>; +// defm StreamingInterfaceINTEL : ExecutionModeOperand<6154, [FPGAKernelAttributesv2INTEL]>; +// defm RegisterMapInterfaceINTEL : ExecutionModeOperand<6160, [FPGAKernelAttributesv2INTEL]>; +// END TODO-SPIRV defm FPFastMathDefault : ExecutionModeOperand<6028, [FloatControls2]>; defm MaximallyReconvergesKHR : ExecutionModeOperand<6023, [Shader]>; @@ -1919,7 +1936,7 @@ defm GenericCastToPtr : SpecConstantOpOperandsOperand<122, [], [Kernel]>; defm PtrCastToGeneric : SpecConstantOpOperandsOperand<121, [], [Kernel]>; defm Bitcast : SpecConstantOpOperandsOperand<124, [], []>; defm QuantizeToF16 : SpecConstantOpOperandsOperand<116, [], [Shader]>; -// Arithmetic +// Arithmetic defm SNegate : SpecConstantOpOperandsOperand<126, [], []>; defm Not : SpecConstantOpOperandsOperand<200, [], []>; defm IAdd : SpecConstantOpOperandsOperand<128, [], []>; diff --git a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp index 5ba035682238b..2951a4bc695e2 100644 --- a/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVTargetMachine.cpp @@ -244,7 +244,8 @@ static cl::opt<bool> SPVEnableNonSemanticDI( cl::Optional, cl::init(false)); void SPIRVPassConfig::addPreEmitPass() { - if (SPVEnableNonSemanticDI) { + if (SPVEnableNonSemanticDI || + getSPIRVTargetMachine().getTargetTriple().getVendor() == Triple::AMD) { addPass(createSPIRVEmitNonSemanticDIPass(&getTM<SPIRVTargetMachine>())); } } diff --git a/llvm/lib/Target/WebAssembly/CMakeLists.txt b/llvm/lib/Target/WebAssembly/CMakeLists.txt index 1e83cbeac50d6..17df119d62709 100644 --- a/llvm/lib/Target/WebAssembly/CMakeLists.txt +++ b/llvm/lib/Target/WebAssembly/CMakeLists.txt @@ -10,6 +10,7 @@ tablegen(LLVM WebAssemblyGenFastISel.inc -gen-fast-isel) tablegen(LLVM WebAssemblyGenInstrInfo.inc -gen-instr-info) tablegen(LLVM WebAssemblyGenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM WebAssemblyGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM WebAssemblyGenSDNodeInfo.inc -gen-sd-node-info) tablegen(LLVM WebAssemblyGenSubtargetInfo.inc -gen-subtarget) add_public_tablegen_target(WebAssemblyCommonTableGen) diff --git 
a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 2666342d0c7b9..66ed8b078b808 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -46,7 +46,7 @@ class WebAssemblyFastISel final : public FastISel { // All possible address modes. class Address { public: - using BaseKind = enum { RegBase, FrameIndexBase }; + enum BaseKind { RegBase, FrameIndexBase }; private: BaseKind Kind = RegBase; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp index 37a34573bb339..9fef3e6d8b089 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp @@ -24,6 +24,7 @@ #include "WebAssembly.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" @@ -114,6 +115,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Wrapper->setAttributes(F->getAttributes()); BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper); const DataLayout &DL = BB->getDataLayout(); + IRBuilder<> Builder(BB); // Determine what arguments to pass. SmallVector<Value *, 4> Args; @@ -140,10 +142,7 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Args.push_back(&*AI); } else { if (CastInst::isBitOrNoopPointerCastable(ArgType, ParamType, DL)) { - Instruction *PtrCast = - CastInst::CreateBitOrPointerCast(AI, ParamType, "cast"); - PtrCast->insertInto(BB, BB->end()); - Args.push_back(PtrCast); + Args.push_back(Builder.CreateBitOrPointerCast(AI, ParamType, "cast")); } else if (ArgType->isStructTy() || ParamType->isStructTy()) { LLVM_DEBUG(dbgs() << "createWrapper: struct param type in bitcast: " << F->getName() << "\n"); @@ -166,24 +165,19 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { for (; AI != AE; ++AI) Args.push_back(&*AI); - CallInst *Call = CallInst::Create(F, Args, "", BB); + CallInst *Call = Builder.CreateCall(F, Args); - Type *ExpectedRtnType = F->getFunctionType()->getReturnType(); - Type *RtnType = Ty->getReturnType(); // Determine what value to return. 
if (RtnType->isVoidTy()) { - ReturnInst::Create(M->getContext(), BB); + Builder.CreateRetVoid(); } else if (ExpectedRtnType->isVoidTy()) { LLVM_DEBUG(dbgs() << "Creating dummy return: " << *RtnType << "\n"); - ReturnInst::Create(M->getContext(), PoisonValue::get(RtnType), BB); + Builder.CreateRet(PoisonValue::get(RtnType)); } else if (RtnType == ExpectedRtnType) { - ReturnInst::Create(M->getContext(), Call, BB); + Builder.CreateRet(Call); } else if (CastInst::isBitOrNoopPointerCastable(ExpectedRtnType, RtnType, DL)) { - Instruction *Cast = - CastInst::CreateBitOrPointerCast(Call, RtnType, "cast"); - Cast->insertInto(BB, BB->end()); - ReturnInst::Create(M->getContext(), Cast, BB); + Builder.CreateRet(Builder.CreateBitOrPointerCast(Call, RtnType, "cast")); } else if (RtnType->isStructTy() || ExpectedRtnType->isStructTy()) { LLVM_DEBUG(dbgs() << "createWrapper: struct return type in bitcast: " << F->getName() << "\n"); @@ -203,9 +197,8 @@ static Function *createWrapper(Function *F, FunctionType *Ty) { Wrapper = Function::Create(Ty, Function::PrivateLinkage, F->getName() + "_bitcast_invalid", M); Wrapper->setAttributes(F->getAttributes()); - BasicBlock *BB = BasicBlock::Create(M->getContext(), "body", Wrapper); - new UnreachableInst(M->getContext(), BB); - Wrapper->setName(F->getName() + "_bitcast_invalid"); + IRBuilder<> Builder(BasicBlock::Create(M->getContext(), "body", Wrapper)); + Builder.CreateUnreachable(); } else if (!WrapperNeeded) { LLVM_DEBUG(dbgs() << "createWrapper: no wrapper needed: " << F->getName() << "\n"); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def deleted file mode 100644 index 23108e429eda8..0000000000000 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ /dev/null @@ -1,64 +0,0 @@ -//- WebAssemblyISD.def - WebAssembly ISD ---------------------------*- C++ -*-// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file describes the various WebAssembly ISD node types. -/// -//===----------------------------------------------------------------------===// - -// NOTE: NO INCLUDE GUARD DESIRED! - -HANDLE_NODETYPE(CALL) -HANDLE_NODETYPE(RET_CALL) -HANDLE_NODETYPE(RETURN) -HANDLE_NODETYPE(ARGUMENT) -HANDLE_NODETYPE(LOCAL_GET) -HANDLE_NODETYPE(LOCAL_SET) -// A wrapper node for TargetExternalSymbol, TargetGlobalAddress, and MCSymbol -HANDLE_NODETYPE(Wrapper) -// A special node for TargetGlobalAddress used in PIC code for -// __memory_base/__table_base relative access. 
-HANDLE_NODETYPE(WrapperREL) -HANDLE_NODETYPE(BR_IF) -HANDLE_NODETYPE(BR_TABLE) -HANDLE_NODETYPE(DOT) -HANDLE_NODETYPE(EXT_ADD_PAIRWISE_U) -HANDLE_NODETYPE(EXT_ADD_PAIRWISE_S) -HANDLE_NODETYPE(SHUFFLE) -HANDLE_NODETYPE(SWIZZLE) -HANDLE_NODETYPE(VEC_SHL) -HANDLE_NODETYPE(VEC_SHR_S) -HANDLE_NODETYPE(VEC_SHR_U) -HANDLE_NODETYPE(NARROW_U) -HANDLE_NODETYPE(EXTEND_LOW_S) -HANDLE_NODETYPE(EXTEND_LOW_U) -HANDLE_NODETYPE(EXTEND_HIGH_S) -HANDLE_NODETYPE(EXTEND_HIGH_U) -HANDLE_NODETYPE(CONVERT_LOW_S) -HANDLE_NODETYPE(CONVERT_LOW_U) -HANDLE_NODETYPE(PROMOTE_LOW) -HANDLE_NODETYPE(TRUNC_SAT_ZERO_S) -HANDLE_NODETYPE(TRUNC_SAT_ZERO_U) -HANDLE_NODETYPE(DEMOTE_ZERO) -HANDLE_NODETYPE(I64_ADD128) -HANDLE_NODETYPE(I64_SUB128) -HANDLE_NODETYPE(I64_MUL_WIDE_S) -HANDLE_NODETYPE(I64_MUL_WIDE_U) - -// Memory intrinsics -HANDLE_NODETYPE(GLOBAL_GET) -HANDLE_NODETYPE(GLOBAL_SET) -HANDLE_NODETYPE(TABLE_GET) -HANDLE_NODETYPE(TABLE_SET) - -// Bulk memory instructions. These follow LLVM's expected semantics of -// supporting out-of-bounds pointers if the length is zero, by inserting -// a branch around Wasm's `memory.copy` and `memory.fill`, which would -// otherwise trap. -HANDLE_NODETYPE(MEMCPY) -HANDLE_NODETYPE(MEMSET) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 7ec463bdc3b84..af322982d5355 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -942,20 +942,6 @@ MachineBasicBlock *WebAssemblyTargetLowering::EmitInstrWithCustomInserter( } } -const char * -WebAssemblyTargetLowering::getTargetNodeName(unsigned Opcode) const { - switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) { - case WebAssemblyISD::FIRST_NUMBER: - break; -#define HANDLE_NODETYPE(NODE) \ - case WebAssemblyISD::NODE: \ - return "WebAssemblyISD::" #NODE; -#include "WebAssemblyISD.def" -#undef HANDLE_NODETYPE - } - return nullptr; -} - std::pair<unsigned, const TargetRegisterClass *> WebAssemblyTargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { @@ -1830,11 +1816,8 @@ SDValue WebAssemblyTargetLowering::LowerLoad(SDValue Op, SDValue Idx = DAG.getTargetConstant(*Local, Base, MVT::i32); EVT LocalVT = LN->getValueType(0); - SDValue LocalGet = DAG.getNode(WebAssemblyISD::LOCAL_GET, DL, LocalVT, - {LN->getChain(), Idx}); - SDValue Result = DAG.getMergeValues({LocalGet, LN->getChain()}, DL); - assert(Result->getNumValues() == 2 && "Loads must carry a chain!"); - return Result; + return DAG.getNode(WebAssemblyISD::LOCAL_GET, DL, {LocalVT, MVT::Other}, + {LN->getChain(), Idx}); } if (WebAssembly::isWasmVarAddressSpace(LN->getAddressSpace())) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h index 472ec678534a4..f7052989b3c75 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h @@ -19,17 +19,6 @@ namespace llvm { -namespace WebAssemblyISD { - -enum NodeType : unsigned { - FIRST_NUMBER = ISD::BUILTIN_OP_END, -#define HANDLE_NODETYPE(NODE) NODE, -#include "WebAssemblyISD.def" -#undef HANDLE_NODETYPE -}; - -} // end namespace WebAssemblyISD - class WebAssemblySubtarget; class WebAssemblyTargetLowering final : public TargetLowering { @@ -53,7 +42,6 @@ class WebAssemblyTargetLowering final : public TargetLowering { MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, 
MachineBasicBlock *MBB) const override; - const char *getTargetNodeName(unsigned Opcode) const override; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp index 2673c81eae40b..cf5cc41ea565b 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp @@ -11,23 +11,31 @@ /// //===----------------------------------------------------------------------===// +#include "WebAssemblySelectionDAGInfo.h" #include "WebAssemblyTargetMachine.h" + +#define GET_SDNODE_DESC +#include "WebAssemblyGenSDNodeInfo.inc" + using namespace llvm; #define DEBUG_TYPE "wasm-selectiondag-info" +WebAssemblySelectionDAGInfo::WebAssemblySelectionDAGInfo() + : SelectionDAGGenTargetInfo(WebAssemblyGenSDNodeInfo) {} + WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() = default; // anchor -bool WebAssemblySelectionDAGInfo::isTargetMemoryOpcode(unsigned Opcode) const { +const char * +WebAssemblySelectionDAGInfo::getTargetNodeName(unsigned Opcode) const { switch (static_cast<WebAssemblyISD::NodeType>(Opcode)) { - default: - return false; - case WebAssemblyISD::GLOBAL_GET: - case WebAssemblyISD::GLOBAL_SET: - case WebAssemblyISD::TABLE_GET: - case WebAssemblyISD::TABLE_SET: - return true; + case WebAssemblyISD::CALL: + return "WebAssemblyISD::CALL"; + case WebAssemblyISD::RET_CALL: + return "WebAssemblyISD::RET_CALL"; } + + return SelectionDAGGenTargetInfo::getTargetNodeName(Opcode); } SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemcpy( diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h index 69c9af0966308..8775f4946d88d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h @@ -17,13 +17,26 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" +#define GET_SDNODE_ENUM +#include "WebAssemblyGenSDNodeInfo.inc" + namespace llvm { +namespace WebAssemblyISD { + +enum NodeType : unsigned { + CALL = GENERATED_OPCODE_END, + RET_CALL, +}; -class WebAssemblySelectionDAGInfo final : public SelectionDAGTargetInfo { +} // namespace WebAssemblyISD + +class WebAssemblySelectionDAGInfo final : public SelectionDAGGenTargetInfo { public: + WebAssemblySelectionDAGInfo(); + ~WebAssemblySelectionDAGInfo() override; - bool isTargetMemoryOpcode(unsigned Opcode) const override; + const char *getTargetNodeName(unsigned Opcode) const override; SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Op1, SDValue Op2, diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 51b540a7a51d0..fa23656e23fc3 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -158,7 +158,16 @@ FunctionPass *createX86InsertX87waitPass(); /// This pass optimizes arithmetic based on knowledge that is only used by /// a reduction sequence and is therefore safe to reassociate in interesting /// ways. 
-FunctionPass *createX86PartialReductionPass(); +class X86PartialReductionPass : public PassInfoMixin<X86PartialReductionPass> { +private: + const X86TargetMachine *TM; + +public: + X86PartialReductionPass(const X86TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); +}; + +FunctionPass *createX86PartialReductionLegacyPass(); /// // Analyzes and emits pseudos to support Win x64 Unwind V2. FunctionPass *createX86WinEHUnwindV2Pass(); @@ -179,7 +188,18 @@ FunctionPass *createX86LowerAMXTypeLegacyPass(); /// The pass transforms amx intrinsics to scalar operation if the function has /// optnone attribute or it is O0. -FunctionPass *createX86LowerAMXIntrinsicsPass(); +class X86LowerAMXIntrinsicsPass + : public PassInfoMixin<X86LowerAMXIntrinsicsPass> { +private: + const TargetMachine *TM; + +public: + X86LowerAMXIntrinsicsPass(const TargetMachine *TM) : TM(TM) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); + static bool isRequired() { return true; } +}; + +FunctionPass *createX86LowerAMXIntrinsicsLegacyPass(); InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &, @@ -220,7 +240,7 @@ void initializeX86LowerAMXIntrinsicsLegacyPassPass(PassRegistry &); void initializeX86LowerAMXTypeLegacyPassPass(PassRegistry &); void initializeX86LowerTileCopyPass(PassRegistry &); void initializeX86OptimizeLEAPassPass(PassRegistry &); -void initializeX86PartialReductionPass(PassRegistry &); +void initializeX86PartialReductionLegacyPass(PassRegistry &); void initializeX86PreTileConfigPass(PassRegistry &); void initializeX86ReturnThunksPass(PassRegistry &); void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp index c0c7f5adf06ef..ddbd10d8f7eda 100644 --- a/llvm/lib/Target/X86/X86CompressEVEX.cpp +++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -272,7 +272,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, MachineBasicBlock &MBB, const MachineOperand &Src2 = MI.getOperand(2); bool Is32BitReg = Opc == X86::ADD32ri_ND || Opc == X86::ADD32rr_ND; const MCInstrDesc &NewDesc = - ST.getInstrInfo()->get(Is32BitReg ? X86::LEA32r : X86::LEA64r); + ST.getInstrInfo()->get(Is32BitReg ? X86::LEA64_32r : X86::LEA64r); if (Is32BitReg) Src1 = getX86SubSuperRegister(Src1, 64); MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), NewDesc, Dst) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 133406bd8e0d7..4d44227b3ecd4 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2572,8 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Combine sin / cos into _sincos_stret if it is available. - setOperationAction(ISD::FSINCOS, MVT::f64, Custom); - setOperationAction(ISD::FSINCOS, MVT::f32, Custom); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f32, Expand); if (Subtarget.isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); @@ -30908,6 +30908,63 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR); } + if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) { + // On AVX512BW, we can use variable 16-bit shifts to implement variable + // 8-bit shifts. For this, we split the input into two vectors, RLo and RHi. 
+ // The i-th lane of RLo contains the (2*i)-th lane of R, and the i-th lane + // of RHi contains the (2*i+1)-th lane of R. After shifting, these vectors + // can efficiently be merged together using a masked move. + MVT ExtVT = MVT::v32i16; + + SDValue RLo, RHi; + // Isolate lower and upper lanes of Amt by masking odd lanes in AmtLo and + // right shifting AmtHi. + SDValue AmtLo = DAG.getNode(ISD::AND, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), + DAG.getConstant(0x00ff, dl, ExtVT)); + SDValue AmtHi = getTargetVShiftByConstNode( + X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG); + switch (Opc) { + case ISD::SHL: + // Because we shift left, no bits from the high half can influence the low + // half, so we don't need to mask RLo. We do however need to mask RHi, to + // prevent high bits of an even lane overflowing into low bits of an odd + // lane. + RLo = DAG.getBitcast(ExtVT, R); + RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo, + DAG.getConstant(0xff00, dl, ExtVT)); + break; + case ISD::SRL: + // Same idea as above, but this time we need to make sure no low bits of + // an odd lane can overflow into high bits of an even lane. + RHi = DAG.getBitcast(ExtVT, R); + RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi, + DAG.getConstant(0x00ff, dl, ExtVT)); + break; + case ISD::SRA: + // For arithmetic right shifts, we want to sign extend each even lane of R + // such that the upper half of the corresponding lane of RLo is 0 or -1 + // depending on the sign bit of the original lane. We do this using 2 + // immediate shifts. + RHi = DAG.getBitcast(ExtVT, R); + RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG); + RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG); + break; + default: + llvm_unreachable("Unexpected Shift Op"); + } + + SDValue ShiftedLo = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RLo, AmtLo)); + SDValue ShiftedHi = + DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RHi, AmtHi)); + + // To merge the shifted vectors back together, we select even lanes + // from ShiftedLo and odd lanes from ShiftedHi. + SDValue SelectMask = DAG.getBitcast( + MVT::v64i1, DAG.getConstant(0x5555555555555555, dl, MVT::i64)); + return DAG.getSelect(dl, VT, SelectMask, ShiftedLo, ShiftedHi); + } + if (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || (VT == MVT::v64i8 && Subtarget.hasBWI())) { @@ -33004,60 +33061,6 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); } -static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - SDValue Arg = Op.getOperand(0); - EVT ArgVT = Arg.getValueType(); - bool isF64 = ArgVT == MVT::f64; - - RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; - const char *LibcallName = TLI.getLibcallName(LC); - if (!LibcallName) - return SDValue(); - - assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); - - // For MacOSX, we want to call an alternative entry point: __sincos_stret, - // which returns the values as { float, float } (in XMM0) or - // { double, double } (which is returned in XMM0, XMM1). - SDLoc dl(Op); - Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); - - TargetLowering::ArgListTy Args; - Args.emplace_back(Arg, ArgTy); - - // Only optimize x86_64 for now. i386 is a bit messy. For f32, - // the small struct {f32, f32} is returned in (eax, edx). 
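Returning to the v64i8 variable-shift lowering introduced above: the even/odd split can be understood on a single 16-bit word holding two 8-bit lanes. A minimal scalar sketch of the SHL case (my own illustration of the idea, not the DAG code; SRL/SRA differ only in which half is masked or sign-extended):

```cpp
// Scalar model of the even/odd-lane trick: shift two packed 8-bit lanes held
// in one 16-bit word, each by its own amount, using only 16-bit shifts, then
// merge the even lane of one result with the odd lane of the other.
#include <cassert>
#include <cstdint>

uint16_t shl_two_bytes(uint16_t R, unsigned AmtLo, unsigned AmtHi) {
  uint16_t RLo = R;          // even lane: overflow into the high byte is fine,
                             // only the low byte of this result is kept.
  uint16_t RHi = R & 0xFF00; // odd lane: mask so even-lane bits cannot shift
                             // up into the odd lane's result.
  uint16_t ShiftedLo = uint16_t(RLo << AmtLo);
  uint16_t ShiftedHi = uint16_t(RHi << AmtHi);
  // Masked merge, the scalar analogue of the v64i1 select mask 0x5555... .
  return uint16_t((ShiftedHi & 0xFF00) | (ShiftedLo & 0x00FF));
}

int main() {
  uint16_t Got = shl_two_bytes(0x8342, /*AmtLo=*/3, /*AmtHi=*/1);
  assert(Got == 0x0610); // 0x42 << 3 -> 0x10 (even), 0x83 << 1 -> 0x06 (odd)
  return 0;
}
```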
For f64, - // the results are returned via SRet in memory. - SDValue Callee = - DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); - - Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) - : (Type *)FixedVectorType::get(ArgTy, 4); - - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl) - .setChain(DAG.getEntryNode()) - .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)); - - std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); - - if (isF64) - // Returned in xmm0 and xmm1. - return CallResult.first; - - // Returned in bits 0:31 and 32:64 xmm0. - SDValue SinVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(0, dl)); - SDValue CosVal = - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first, - DAG.getVectorIdxConstant(1, dl)); - SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); - return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); -} - /// Widen a vector input to a vector of NVT. The /// input vector must have the same element type as NVT. static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, @@ -33662,7 +33665,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::ABDS: case ISD::ABDU: return LowerABD(Op, Subtarget, DAG); case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG); - case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); @@ -53347,6 +53349,112 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, return SDValue(); } +// Look for a RMW operation that only touches one bit of a larger than legal +// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single +// i32 sub value. +static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; + SDValue StoredVal = St->getValue(); + EVT VT = StoredVal.getValueType(); + + // Only narrow normal stores of larger than legal scalar integers. + if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() || + VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32)) + return SDValue(); + + // BTR: X & ~(1 << ShAmt) + // BTS: X | (1 << ShAmt) + // BTC: X ^ (1 << ShAmt) + // + // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) + SDValue SrcVal, InsertBit, ShAmt; + if (!(sd_match(StoredVal, m_And(m_Value(SrcVal), + m_Not(m_Shl(m_One(), m_Value(ShAmt))))) || + sd_match(StoredVal, + m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match(StoredVal, + m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) || + sd_match( + StoredVal, + m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))), + m_Shl(m_Value(InsertBit), m_Deferred(ShAmt)))))) + return SDValue(); + + // SrcVal must be a matching normal load further up the chain. + auto *Ld = dyn_cast<LoadSDNode>(SrcVal); + if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() || + Ld->getBasePtr() != St->getBasePtr() || + Ld->getOffset() != St->getOffset() || + !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1))) + return SDValue(); + + // Ensure the shift amount is in bounds. + KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); + if (KnownAmt.getMaxValue().uge(VT.getSizeInBits())) + return SDValue(); + + // If we're inserting a bit then it must be the LSB. 
+ if (InsertBit) { + KnownBits KnownInsert = DAG.computeKnownBits(InsertBit); + if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1)) + return SDValue(); + } + + // Split the shift into an alignment shift that moves the active i32 block to + // the bottom bits for truncation and a modulo shift that can act on the i32. + EVT AmtVT = ShAmt.getValueType(); + SDValue AlignAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getSignedConstant(-32LL, DL, AmtVT)); + SDValue ModuloAmt = + DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT)); + ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8); + + // Compute the byte offset for the i32 block that is changed by the RMW. + // combineTruncate will adjust the load for us in a similar way. + EVT PtrVT = St->getBasePtr().getValueType(); + SDValue PtrBitOfs = DAG.getZExtOrTrunc(AlignAmt, DL, PtrVT); + SDValue PtrByteOfs = DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs, + DAG.getShiftAmountConstant(3, PtrVT, DL)); + SDValue NewPtr = DAG.getMemBasePlusOffset(St->getBasePtr(), PtrByteOfs, DL, + SDNodeFlags::NoUnsignedWrap); + + // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store. + SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt); + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + + SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getConstant(1, DL, MVT::i32), ModuloAmt); + + SDValue Res; + if (InsertBit) { + SDValue BitMask = + DAG.getNode(ISD::SHL, DL, MVT::i32, + DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt); + Res = + DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32)); + Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask); + } else { + if (StoredVal.getOpcode() == ISD::AND) + Mask = DAG.getNOT(DL, Mask, MVT::i32); + Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask); + } + + SDValue NewStore = + DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(), + Align(), St->getMemOperand()->getFlags()); + + // If there are other uses of StoredVal, replace with a new load of the + // whole (updated) value. + if (!StoredVal.hasOneUse()) { + SDValue NewLoad = + DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand()); + DAG.ReplaceAllUsesWith(StoredVal, NewLoad); + } + return NewStore; +} + static SDValue combineStore(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -53573,6 +53681,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, } } + if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget)) + return R; + // Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC) // store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC) if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && @@ -54505,8 +54616,9 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, // truncation, see if we can convert the shift into a pointer offset instead. // Limit this to normal (non-ext) scalar integer loads. 
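To make the narrowBitOpRMW fold above concrete: a single-bit set (the BTS shape) on a wider-than-legal integer in memory only needs to read, modify and write the aligned 32-bit word containing the bit. A scalar sketch of the same offset and mask arithmetic, assuming a little-endian byte layout; this is an illustration, not the combine itself:

```cpp
// Set bit Idx of a 128-bit value stored in memory by rewriting only the
// aligned 32-bit word that holds it. AlignAmt = Idx & -32 selects the word,
// ModuloAmt = Idx & 31 is the bit position inside it.
#include <cassert>
#include <cstdint>
#include <cstring>

void bts128(unsigned char *Mem, unsigned Idx) {
  unsigned AlignAmt  = Idx & ~31u;       // bit offset of the containing i32
  unsigned ModuloAmt = Idx & 31u;        // bit within that i32
  unsigned ByteOfs   = AlignAmt >> 3;    // the pointer offset the combine forms
  uint32_t Word;
  std::memcpy(&Word, Mem + ByteOfs, 4);  // narrow load
  Word |= 1u << ModuloAmt;               // BTS on the i32 (OR with the mask)
  std::memcpy(Mem + ByteOfs, &Word, 4);  // narrow store
}

int main() {
  unsigned char Mem[16] = {};
  bts128(Mem, 70);                       // bit 70 = byte 8, bit 6 (little endian)
  assert(Mem[8] == (1u << 6));
  return 0;
}
```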
if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL && - Src.hasOneUse() && Src.getOperand(0).hasOneUse() && - ISD::isNormalLoad(Src.getOperand(0).getNode())) { + Src.hasOneUse() && ISD::isNormalLoad(Src.getOperand(0).getNode()) && + (Src.getOperand(0).hasOneUse() || + !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, SrcVT))) { auto *Ld = cast<LoadSDNode>(Src.getOperand(0)); if (Ld->isSimple() && VT.isByteSized() && isPowerOf2_64(VT.getSizeInBits())) { @@ -54529,8 +54641,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, SDValue NewLoad = DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(), Align(), Ld->getMemOperand()->getFlags()); - DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1), - NewLoad.getValue(1)); + DAG.makeEquivalentMemoryOrdering(Ld, NewLoad); return NewLoad; } } @@ -56306,6 +56417,7 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC, static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + using namespace SDPatternMatch; const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); const SDValue LHS = N->getOperand(0); const SDValue RHS = N->getOperand(1); @@ -56364,6 +56476,37 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, if (SDValue AndN = MatchAndCmpEq(RHS, LHS)) return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC); + // If we're performing a bit test on a larger than legal type, attempt + // to (aligned) shift down the value to the bottom 32-bits and then + // perform the bittest on the i32 value. + // ICMP_ZERO(AND(X,SHL(1,IDX))) + // --> ICMP_ZERO(AND(TRUNC(SRL(X,AND(IDX,-32))),SHL(1,AND(IDX,31)))) + if (isNullConstant(RHS) && + OpVT.getScalarSizeInBits() > (Subtarget.is64Bit() ? 64 : 32)) { + SDValue X, ShAmt; + if (sd_match(LHS, m_OneUse(m_And(m_Value(X), + m_Shl(m_One(), m_Value(ShAmt)))))) { + // Only attempt this if the shift amount is known to be in bounds. + KnownBits KnownAmt = DAG.computeKnownBits(ShAmt); + if (KnownAmt.getMaxValue().ult(OpVT.getScalarSizeInBits())) { + EVT AmtVT = ShAmt.getValueType(); + SDValue AlignAmt = + DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getSignedConstant(-32LL, DL, AmtVT)); + SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, + DAG.getConstant(31, DL, AmtVT)); + SDValue Mask = DAG.getNode( + ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), + DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8)); + X = DAG.getNode(ISD::SRL, DL, OpVT, X, AlignAmt); + X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); + X = DAG.getNode(ISD::AND, DL, MVT::i32, X, Mask); + return DAG.getSetCC(DL, VT, X, DAG.getConstant(0, DL, MVT::i32), + CC); + } + } + } + // cmpeq(trunc(x),C) --> cmpeq(x,C) // cmpne(trunc(x),C) --> cmpne(x,C) // iff x upper bits are zero. 
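The setcc combine above leans on the same index split: testing bit IDX of a wide integer is equivalent to testing bit (IDX & 31) of the 32-bit word selected by (IDX & -32). A quick self-check of that identity for a 128-bit value, assuming a compiler with __int128 support:

```cpp
// Checks ICMP_ZERO(AND(X, SHL(1, Idx))) against
// ICMP_ZERO(AND(TRUNC(SRL(X, Idx & -32)), SHL(1, Idx & 31))) for every Idx.
#include <cassert>
#include <cstdint>

int main() {
  unsigned __int128 X = ((unsigned __int128)0x0123456789abcdefULL << 64) |
                        0xfedcba9876543210ULL;
  for (unsigned Idx = 0; Idx < 128; ++Idx) {
    bool Wide = ((X >> Idx) & 1) != 0;               // bit test on the i128
    uint32_t Word = (uint32_t)(X >> (Idx & ~31u));   // TRUNC(SRL(X, Idx & -32))
    bool Narrow = (Word & (1u << (Idx & 31u))) != 0; // bit test on the i32
    assert(Wide == Narrow);
  }
  return 0;
}
```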
diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp index 7f3393910da2c..662aec2c15241 100644 --- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -23,12 +23,15 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -40,7 +43,7 @@ using namespace llvm; using namespace PatternMatch; -#define DEBUG_TYPE "lower-amx-intrinsics" +#define DEBUG_TYPE "x86-lower-amx-intrinsics" #ifndef NDEBUG static bool isV256I32Ty(Type *Ty) { @@ -626,6 +629,37 @@ bool X86LowerAMXIntrinsics::visit() { return C; } +namespace { +bool shouldRunLowerAMXIntrinsics(const Function &F, const TargetMachine *TM) { + return X86ScalarizeAMX && (F.hasFnAttribute(Attribute::OptimizeNone) || + TM->getOptLevel() == CodeGenOptLevel::None); +} + +bool runLowerAMXIntrinsics(Function &F, DominatorTree *DT, LoopInfo *LI) { + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + + X86LowerAMXIntrinsics LAT(F, DTU, LI); + return LAT.visit(); +} +} // namespace + +PreservedAnalyses X86LowerAMXIntrinsicsPass::run(Function &F, + FunctionAnalysisManager &FAM) { + if (!shouldRunLowerAMXIntrinsics(F, TM)) + return PreservedAnalyses::all(); + + DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); + LoopInfo &LI = FAM.getResult<LoopAnalysis>(F); + bool Changed = runLowerAMXIntrinsics(F, &DT, &LI); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<LoopAnalysis>(); + return PA; +} + namespace { class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass { public: @@ -634,21 +668,15 @@ class X86LowerAMXIntrinsicsLegacyPass : public FunctionPass { X86LowerAMXIntrinsicsLegacyPass() : FunctionPass(ID) {} bool runOnFunction(Function &F) override { - if (!X86ScalarizeAMX) - return false; TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); - if (!F.hasFnAttribute(Attribute::OptimizeNone) && - TM->getOptLevel() != CodeGenOptLevel::None) + if (!shouldRunLowerAMXIntrinsics(F, TM)) return false; auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>(); auto *LI = LIWP ? 
&LIWP->getLoopInfo() : nullptr; - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - - X86LowerAMXIntrinsics LAT(F, DTU, LI); - return LAT.visit(); + return runLowerAMXIntrinsics(F, DT, LI); } StringRef getPassName() const override { return "Lower AMX intrinsics"; } @@ -668,6 +696,6 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName, false, false) -FunctionPass *llvm::createX86LowerAMXIntrinsicsPass() { +FunctionPass *llvm::createX86LowerAMXIntrinsicsLegacyPass() { return new X86LowerAMXIntrinsicsLegacyPass(); } diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp index a25e4e0f464a4..898c83cf9b468 100644 --- a/llvm/lib/Target/X86/X86PartialReduction.cpp +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -16,10 +16,12 @@ #include "X86TargetMachine.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Analysis.h" #include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsX86.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Support/KnownBits.h" @@ -30,39 +32,44 @@ using namespace llvm; namespace { -class X86PartialReduction : public FunctionPass { +class X86PartialReduction { + const X86TargetMachine *TM; const DataLayout *DL = nullptr; const X86Subtarget *ST = nullptr; +public: + X86PartialReduction(const X86TargetMachine *TM) : TM(TM) {} + bool run(Function &F); + +private: + bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB); + bool trySADReplacement(Instruction *Op); +}; + +class X86PartialReductionLegacy : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid. - X86PartialReduction() : FunctionPass(ID) { } + X86PartialReductionLegacy() : FunctionPass(ID) {} - bool runOnFunction(Function &Fn) override; + bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } - StringRef getPassName() const override { - return "X86 Partial Reduction"; - } - -private: - bool tryMAddReplacement(Instruction *Op, bool ReduceInOneBB); - bool trySADReplacement(Instruction *Op); + StringRef getPassName() const override { return "X86 Partial Reduction"; } }; } -FunctionPass *llvm::createX86PartialReductionPass() { - return new X86PartialReduction(); +FunctionPass *llvm::createX86PartialReductionLegacyPass() { + return new X86PartialReductionLegacy(); } -char X86PartialReduction::ID = 0; +char X86PartialReductionLegacy::ID = 0; -INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE, - "X86 Partial Reduction", false, false) +INITIALIZE_PASS(X86PartialReductionLegacy, DEBUG_TYPE, "X86 Partial Reduction", + false, false) // This function should be aligned with detectExtMul() in X86ISelLowering.cpp. 
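The X86 pass changes above follow the usual porting recipe: one implementation class holds the transform, and both the legacy FunctionPass and the new-PM pass are thin drivers around it. A stripped-down sketch of that shape with hypothetical names and no LLVM headers, just to show the division of responsibility:

```cpp
// "Shared impl, two drivers" sketch, mirroring how X86PartialReduction and
// X86LowerAMXIntrinsics are split above. All types here are stand-ins.
struct Function { const char *Name; };

// The transform itself, usable from either pass manager.
class PartialReductionImpl {
public:
  bool run(Function &) { return false; } // returns whether anything changed
};

// Legacy pass driver: just forwards.
class PartialReductionLegacy {
public:
  bool runOnFunction(Function &F) { return PartialReductionImpl().run(F); }
};

// New-PM driver: forwards and reports what it preserved.
enum class Preserved { All, CFGOnly };
class PartialReductionPass {
public:
  Preserved run(Function &F) {
    return PartialReductionImpl().run(F) ? Preserved::CFGOnly : Preserved::All;
  }
};

int main() {
  Function F{"foo"};
  PartialReductionLegacy().runOnFunction(F);
  (void)PartialReductionPass().run(F);
  return 0;
}
```

The payoff of this split is that the gating logic (such as shouldRunLowerAMXIntrinsics above) lives in exactly one place, so the two entry points cannot drift apart.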
static bool matchVPDPBUSDPattern(const X86Subtarget *ST, BinaryOperator *Mul, @@ -494,17 +501,8 @@ static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) { } } -bool X86PartialReduction::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - - auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); - if (!TPC) - return false; - - auto &TM = TPC->getTM<X86TargetMachine>(); - ST = TM.getSubtargetImpl(F); - +bool X86PartialReduction::run(Function &F) { + ST = TM->getSubtargetImpl(F); DL = &F.getDataLayout(); bool MadeChange = false; @@ -540,3 +538,25 @@ bool X86PartialReduction::runOnFunction(Function &F) { return MadeChange; } + +bool X86PartialReductionLegacy::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + return false; + + return X86PartialReduction(&TPC->getTM<X86TargetMachine>()).run(F); +} + +PreservedAnalyses X86PartialReductionPass::run(Function &F, + FunctionAnalysisManager &FAM) { + bool Changed = X86PartialReduction(TM).run(F); + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA = PreservedAnalyses::none(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} diff --git a/llvm/lib/Target/X86/X86PassRegistry.def b/llvm/lib/Target/X86/X86PassRegistry.def index fc25d55d3059a..db255940f8829 100644 --- a/llvm/lib/Target/X86/X86PassRegistry.def +++ b/llvm/lib/Target/X86/X86PassRegistry.def @@ -15,14 +15,14 @@ #ifndef FUNCTION_PASS #define FUNCTION_PASS(NAME, CREATE_PASS) #endif +FUNCTION_PASS("x86-lower-amx-intrinsics", X86LowerAMXIntrinsicsPass(this)) FUNCTION_PASS("x86-lower-amx-type", X86LowerAMXTypePass(this)) +FUNCTION_PASS("x86-partial-reduction", X86PartialReductionPass(this)) #undef FUNCTION_PASS #ifndef DUMMY_FUNCTION_PASS #define DUMMY_FUNCTION_PASS(NAME, CREATE_PASS) #endif -DUMMY_FUNCTION_PASS("lower-amx-intrinsics", X86LowerAMXIntrinsics(*this)) -DUMMY_FUNCTION_PASS("x86-partial-reduction", X86PartialReduction()) DUMMY_FUNCTION_PASS("x86-winehstate", WinEHStatePass()) #undef DUMMY_FUNCTION_PASS diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 9a76abcd351bf..5f0bcab251e61 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -97,7 +97,7 @@ extern "C" LLVM_C_ABI void LLVMInitializeX86Target() { initializeX86LoadValueInjectionLoadHardeningPassPass(PR); initializeX86LoadValueInjectionRetHardeningPassPass(PR); initializeX86OptimizeLEAPassPass(PR); - initializeX86PartialReductionPass(PR); + initializeX86PartialReductionLegacyPass(PR); initializePseudoProbeInserterPass(PR); initializeX86ReturnThunksPass(PR); initializeX86DAGToDAGISelLegacyPass(PR); @@ -422,14 +422,14 @@ void X86PassConfig::addIRPasses() { // We add both pass anyway and when these two passes run, we skip the pass // based on the option level and option attribute. 
- addPass(createX86LowerAMXIntrinsicsPass()); + addPass(createX86LowerAMXIntrinsicsLegacyPass()); addPass(createX86LowerAMXTypeLegacyPass()); TargetPassConfig::addIRPasses(); if (TM->getOptLevel() != CodeGenOptLevel::None) { addPass(createInterleavedAccessPass()); - addPass(createX86PartialReductionPass()); + addPass(createX86PartialReductionLegacyPass()); } // Add passes that handle indirect branch removal and insertion of a retpoline diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp index f6f7e92d98578..2f28ab36aa193 100644 --- a/llvm/lib/Target/X86/X86VZeroUpper.cpp +++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp @@ -66,7 +66,7 @@ namespace { MachineBasicBlock &MBB); void addDirtySuccessor(MachineBasicBlock &MBB); - using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY }; + enum BlockExitState { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY }; static const char* getBlockExitStateName(BlockExitState ST); diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 5ed47aec08b25..a6ac7610a2c7a 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -5185,6 +5185,7 @@ struct AADereferenceableCallSiteReturned final // ------------------------ Align Argument Attribute ------------------------ namespace { + static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, Value &AssociatedValue, const Use *U, const Instruction *I, bool &TrackUse) { @@ -5200,6 +5201,28 @@ static unsigned getKnownAlignForUse(Attributor &A, AAAlign &QueryingAA, TrackUse = true; return 0; } + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + // Is it appropriate to pull attribute in initialization? 
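For the ptrmask alignment handling around this point: a mask constant with K trailing zero bits forces at least 2^K alignment on the result, and clearing low bits can never break the alignment the source pointer already had, which is why the larger of the two is taken. A small numeric check of both facts (illustration only):

```cpp
// ptrmask(p, m) == p & m: the result is aligned to 1 << ctz(m), and it also
// keeps whatever alignment p already had, so the best known value is the max.
#include <cassert>
#include <cstdint>

int main() {
  uint64_t P = 0x1008;            // an 8-byte aligned address
  uint64_t Mask = ~uint64_t(63);  // 6 trailing zeros -> forces 64-byte alignment
  uint64_t Masked = P & Mask;     // what llvm.ptrmask computes
  assert(Masked % 64 == 0);       // alignment implied by the mask
  assert(Masked % 8 == 0);        // original alignment of P is preserved
  return 0;
}
```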
+ const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + QueryingAA, IRPosition::value(*II->getOperand(1)), DepClassTy::NONE); + const auto *AlignAA = A.getAAFor<AAAlign>( + QueryingAA, IRPosition::value(*II), DepClassTy::NONE); + if (ConstVals && ConstVals->isValidState() && ConstVals->isAtFixpoint()) { + unsigned ShiftValue = std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Align ConstAlign(UINT64_C(1) << ShiftValue); + if (ConstAlign >= AlignAA->getKnownAlign()) + return Align(1).value(); + } + if (AlignAA) + return AlignAA->getKnownAlign().value(); + break; + } + default: + break; + } MaybeAlign MA; if (const auto *CB = dyn_cast<CallBase>(I)) { @@ -5499,6 +5522,44 @@ struct AAAlignCallSiteReturned final AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {} + ChangeStatus updateImpl(Attributor &A) override { + Instruction *I = getIRPosition().getCtxI(); + if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + case Intrinsic::ptrmask: { + Align Alignment; + bool Valid = false; + + const auto *ConstVals = A.getAAFor<AAPotentialConstantValues>( + *this, IRPosition::value(*II->getOperand(1)), DepClassTy::REQUIRED); + if (ConstVals && ConstVals->isValidState()) { + unsigned ShiftValue = + std::min(ConstVals->getAssumedMinTrailingZeros(), + Value::MaxAlignmentExponent); + Alignment = Align(UINT64_C(1) << ShiftValue); + Valid = true; + } + + const auto *AlignAA = + A.getAAFor<AAAlign>(*this, IRPosition::value(*(II->getOperand(0))), + DepClassTy::REQUIRED); + if (AlignAA && AlignAA->isValidState()) { + Alignment = std::max(AlignAA->getAssumedAlign(), Alignment); + Valid = true; + } + + if (Valid) + return clampStateAndIndicateChange<StateType>( + this->getState(), + std::min(this->getAssumedAlign(), Alignment).value()); + break; + } + default: + break; + } + } + return Base::updateImpl(A); + }; /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); } }; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index f5130da818746..9572f9d702e1b 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3599,6 +3599,21 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelCond->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Specific(C), m_Value(A2))) && + SelCond) { + return SelectInst::Create(C, A, B, "", nullptr, SelCond); + } else if (match(FalseVal, + m_LogicalAnd(m_Not(m_Value(C2)), m_Value(B2))) && + SelFVal) { + SelectInst *NewSI = SelectInst::Create(C, A, B, "", nullptr, SelFVal); + NewSI->swapProfMetadata(); + return NewSI; + } else { + return createSelectInstWithUnknownProfile(C, A, B); + } + } return SelectInst::Create(C, A, B); } @@ -3615,6 +3630,20 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { m_Not(m_Specific(SelFVal->getTrueValue()))); if (MayNeedFreeze) C = Builder.CreateFreeze(C); + if (!ProfcheckDisableMetadataFixes) { + Value *C2 = nullptr, *A2 = nullptr, *B2 = nullptr; + if (match(CondVal, m_LogicalAnd(m_Not(m_Value(C2)), m_Value(A2))) && + SelCond) { + SelectInst *NewSI = SelectInst::Create(C, B, A, "", nullptr, SelCond); + NewSI->swapProfMetadata(); + return 
NewSI; + } else if (match(FalseVal, m_LogicalAnd(m_Specific(C), m_Value(B2))) && + SelFVal) { + return SelectInst::Create(C, B, A, "", nullptr, SelFVal); + } else { + return createSelectInstWithUnknownProfile(C, B, A); + } + } return SelectInst::Create(C, B, A); } } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 67f837c7ed968..b158e0f626850 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2261,11 +2261,11 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) { } Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) { - if (!isa<Constant>(I.getOperand(1))) - return nullptr; + bool IsOtherParamConst = isa<Constant>(I.getOperand(1)); if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) { - if (Instruction *NewSel = FoldOpIntoSelect(I, Sel)) + if (Instruction *NewSel = + FoldOpIntoSelect(I, Sel, false, !IsOtherParamConst)) return NewSel; } else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) { if (Instruction *NewPhi = foldOpIntoPhi(I, PN)) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 471c6ec633a57..ceeece41782f4 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3903,7 +3903,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // adding/"accumulating" %s. "Accumulation" stores the result in one // of the source registers, but this accumulate vs. add distinction // is lost when dealing with LLVM intrinsics.) + // + // ZeroPurifies means that multiplying a known-zero with an uninitialized + // value results in an initialized value. This is applicable for integer + // multiplication, but not floating-point (counter-example: NaN). void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor, + bool ZeroPurifies, unsigned EltSizeInBits = 0) { IRBuilder<> IRB(&I); @@ -3945,7 +3950,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { assert(AccumulatorType == ReturnType); } - FixedVectorType *ImplicitReturnType = ReturnType; + FixedVectorType *ImplicitReturnType = + cast<FixedVectorType>(getShadowTy(ReturnType)); // Step 1: instrument multiplication of corresponding vector elements if (EltSizeInBits) { ImplicitReturnType = cast<FixedVectorType>( @@ -3964,30 +3970,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { ReturnType->getNumElements() * ReductionFactor); } - // Multiplying an *initialized* zero by an uninitialized element results in - // an initialized zero element. - // - // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value - // results in an unpoisoned value. We can therefore adapt the visitAnd() - // instrumentation: - // OutShadow = (SaNonZero & SbNonZero) - // | (VaNonZero & SbNonZero) - // | (SaNonZero & VbNonZero) - // where non-zero is checked on a per-element basis (not per bit). 
- Value *SZero = Constant::getNullValue(Va->getType()); - Value *VZero = Constant::getNullValue(Sa->getType()); - Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero); - Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero); - Value *VaNonZero = IRB.CreateICmpNE(Va, VZero); - Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero); - - Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); - Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); - Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); - // Each element of the vector is represented by a single bit (poisoned or // not) e.g., <8 x i1>. - Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + Value *SaNonZero = IRB.CreateIsNotNull(Sa); + Value *SbNonZero = IRB.CreateIsNotNull(Sb); + Value *And; + if (ZeroPurifies) { + // Multiplying an *initialized* zero by an uninitialized element results + // in an initialized zero element. + // + // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value + // results in an unpoisoned value. We can therefore adapt the visitAnd() + // instrumentation: + // OutShadow = (SaNonZero & SbNonZero) + // | (VaNonZero & SbNonZero) + // | (SaNonZero & VbNonZero) + // where non-zero is checked on a per-element basis (not per bit). + Value *VaInt = Va; + Value *VbInt = Vb; + if (!Va->getType()->isIntegerTy()) { + VaInt = CreateAppToShadowCast(IRB, Va); + VbInt = CreateAppToShadowCast(IRB, Vb); + } + + Value *VaNonZero = IRB.CreateIsNotNull(VaInt); + Value *VbNonZero = IRB.CreateIsNotNull(VbInt); + + Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); + Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); + Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); + + And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + } else { + And = IRB.CreateOr({SaNonZero, SbNonZero}); + } // Extend <8 x i1> to <8 x i16>. 
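A scalar model of the shadow rule quoted above for the integer (ZeroPurifies) case: one product element is poisoned only when an operand is poisoned and the other operand is not a known, initialized zero. Booleans stand in for the per-element shadow here; this is an illustration of the formula, not the instrumentation:

```cpp
// Per-element shadow for x*y when zero purifies:
//   Out = (Sx & Sy) | (Vx!=0 & Sy) | (Sx & Vy!=0)
// Sx/Sy: "operand element is poisoned"; Vx/Vy: the concrete values.
#include <cassert>

bool mulShadow(bool Sx, int Vx, bool Sy, int Vy) {
  bool VxNZ = Vx != 0, VyNZ = Vy != 0;
  return (Sx && Sy) || (VxNZ && Sy) || (Sx && VyNZ);
}

int main() {
  assert(!mulShadow(false, 3, false, 4)); // clean * clean          -> clean
  assert(!mulShadow(false, 0, true, 7));  // initialized 0 * poison -> clean
  assert(mulShadow(false, 2, true, 7));   // nonzero * poison       -> poison
  assert(mulShadow(true, 1, true, 1));    // poison * poison        -> poison
  return 0;
}
```

This is exactly why the comment restricts zero-purification to integer multiplies: for floating point, 0.0 * NaN is still NaN, so an uninitialized operand can leak through a zero.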
// (The real pmadd intrinsic would have computed intermediate values of @@ -5752,17 +5768,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_ssse3_pmadd_ub_sw_128: case Intrinsic::x86_avx2_pmadd_ub_sw: case Intrinsic::x86_avx512_pmaddubs_w_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true); break; // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) case Intrinsic::x86_ssse3_pmadd_ub_sw: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/8); break; // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) case Intrinsic::x86_mmx_pmadd_wd: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/16); break; // AVX Vector Neural Network Instructions: bytes @@ -5848,7 +5867,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx2_vpdpbuuds_128: case Intrinsic::x86_avx2_vpdpbuuds_256: case Intrinsic::x86_avx10_vpdpbuuds_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, /*EltSize=*/8); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/4, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/8); break; // AVX Vector Neural Network Instructions: words @@ -5901,7 +5921,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { case Intrinsic::x86_avx512_vpdpwssds_128: case Intrinsic::x86_avx512_vpdpwssds_256: case Intrinsic::x86_avx512_vpdpwssds_512: - handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, + /*ZeroPurifies=*/true, /*EltSizeInBits=*/16); break; // TODO: Dot Product of BF16 Pairs Accumulated Into Packed Single diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index 19eccb9e17020..9ffa602416b05 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -1796,14 +1796,16 @@ struct LoopFuser { // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); - // Forget block dispositions as well, so that there are no dangling - // pointers to erased/free'ed blocks. - SE.forgetBlockAndLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. mergeLatch(FC0, FC1); + // Forget block dispositions as well, so that there are no dangling + // pointers to erased/free'ed blocks. It should be done after mergeLatch() + // since merging the latches may affect the dispositions. + SE.forgetBlockAndLoopDispositions(); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); for (BasicBlock *BB : Blocks) { @@ -2092,14 +2094,16 @@ struct LoopFuser { // mergeLatch may remove the only block in FC1. SE.forgetLoop(FC1.L); SE.forgetLoop(FC0.L); - // Forget block dispositions as well, so that there are no dangling - // pointers to erased/free'ed blocks. - SE.forgetBlockAndLoopDispositions(); // Move instructions from FC0.Latch to FC1.Latch. // Note: mergeLatch requires an updated DT. mergeLatch(FC0, FC1); + // Forget block dispositions as well, so that there are no dangling + // pointers to erased/free'ed blocks. 
It should be done after mergeLatch() + // since merging the latches may affect the dispositions. + SE.forgetBlockAndLoopDispositions(); + // Merge the loops. SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks()); for (BasicBlock *BB : Blocks) { diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index a8839981e5478..1b770be3909a9 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -89,8 +89,8 @@ struct StoreToLoadForwardingCandidate { /// Return true if the dependence from the store to the load has an /// absolute distance of one. /// E.g. A[i+1] = A[i] (or A[i-1] = A[i] for descending loop) - bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, - Loop *L) const { + bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE, Loop *L, + const DominatorTree &DT) const { Value *LoadPtr = Load->getPointerOperand(); Value *StorePtr = Store->getPointerOperand(); Type *LoadType = getLoadStoreType(Load); @@ -102,8 +102,10 @@ struct StoreToLoadForwardingCandidate { DL.getTypeSizeInBits(getLoadStoreType(Store)) && "Should be a known dependence"); - int64_t StrideLoad = getPtrStride(PSE, LoadType, LoadPtr, L).value_or(0); - int64_t StrideStore = getPtrStride(PSE, LoadType, StorePtr, L).value_or(0); + int64_t StrideLoad = + getPtrStride(PSE, LoadType, LoadPtr, L, DT).value_or(0); + int64_t StrideStore = + getPtrStride(PSE, LoadType, StorePtr, L, DT).value_or(0); if (!StrideLoad || !StrideStore || StrideLoad != StrideStore) return false; @@ -287,8 +289,8 @@ class LoadEliminationForLoop { // so deciding which one forwards is easy. The later one forwards as // long as they both have a dependence distance of one to the load. if (Cand.Store->getParent() == OtherCand->Store->getParent() && - Cand.isDependenceDistanceOfOne(PSE, L) && - OtherCand->isDependenceDistanceOfOne(PSE, L)) { + Cand.isDependenceDistanceOfOne(PSE, L, *DT) && + OtherCand->isDependenceDistanceOfOne(PSE, L, *DT)) { // They are in the same block, the later one will forward to the load. if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store)) OtherCand = &Cand; @@ -538,7 +540,7 @@ class LoadEliminationForLoop { // Check whether the SCEV difference is the same as the induction step, // thus we load the value in the next iteration. 
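For the isDependenceDistanceOfOne checks now threading a DominatorTree above, the pattern being legalized is the classic distance-one forward: the store of iteration i is exactly the load of iteration i+1. A source-level picture of a candidate and of what load elimination effectively produces (illustrative only):

```cpp
#include <cstddef>

// Candidate: the load A[I] reads the value stored by the previous iteration.
void candidate(double *A, std::size_t N) {
  for (std::size_t I = 0; I + 1 < N; ++I)
    A[I + 1] = A[I] * 2.0;
}

// Roughly the effect of forwarding: one startup load, then carry in a register.
void forwarded(double *A, std::size_t N) {
  if (N < 2)
    return;
  double Carried = A[0];
  for (std::size_t I = 0; I + 1 < N; ++I) {
    double Stored = Carried * 2.0;
    A[I + 1] = Stored;
    Carried = Stored; // value forwarded to the next iteration's "load"
  }
}

int main() { return 0; }
```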
- if (!Cand.isDependenceDistanceOfOne(PSE, L)) + if (!Cand.isDependenceDistanceOfOne(PSE, L, *DT)) continue; assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) && diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 2bda9d83236e8..802ae4e9c28e3 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1327,7 +1327,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, } // Do not attempt partial/runtime unrolling in FullLoopUnrolling - if (OnlyFullUnroll && (UP.Count < TripCount || UP.Count < MaxTripCount)) { + if (OnlyFullUnroll && ((!TripCount && !MaxTripCount) || + UP.Count < TripCount || UP.Count < MaxTripCount)) { LLVM_DEBUG( dbgs() << "Not attempting partial/runtime unroll in FullLoopUnroll.\n"); return LoopUnrollResult::Unmodified; diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index bb6c879f4d47e..0577ddbd2353c 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -40,6 +40,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" @@ -337,7 +338,7 @@ static void buildPartialUnswitchConditionalBranch( static void buildPartialInvariantUnswitchConditionalBranch( BasicBlock &BB, ArrayRef<Value *> ToDuplicate, bool Direction, BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, Loop &L, - MemorySSAUpdater *MSSAU) { + MemorySSAUpdater *MSSAU, const BranchInst &OriginalBranch) { ValueToValueMapTy VMap; for (auto *Val : reverse(ToDuplicate)) { Instruction *Inst = cast<Instruction>(Val); @@ -377,8 +378,19 @@ static void buildPartialInvariantUnswitchConditionalBranch( IRBuilder<> IRB(&BB); IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); Value *Cond = VMap[ToDuplicate[0]]; - IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, - Direction ? &NormalSucc : &UnswitchedSucc); + // The expectation is that ToDuplicate[0] is the condition used by the + // OriginalBranch, case in which we can clone the profile metadata from there. + auto *ProfData = + !ProfcheckDisableMetadataFixes && + ToDuplicate[0] == skipTrivialSelect(OriginalBranch.getCondition()) + ? OriginalBranch.getMetadata(LLVMContext::MD_prof) + : nullptr; + auto *BR = + IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, + Direction ? &NormalSucc : &UnswitchedSucc, ProfData); + if (!ProfData) + setExplicitlyUnknownBranchWeightsIfProfiled(*BR, *BR->getFunction(), + DEBUG_TYPE); } /// Rewrite the PHI nodes in an unswitched loop exit basic block. @@ -2515,7 +2527,7 @@ static void unswitchNontrivialInvariants( // the branch in the split block. 
if (PartiallyInvariant) buildPartialInvariantUnswitchConditionalBranch( - *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU); + *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, L, MSSAU, *BI); else { buildPartialUnswitchConditionalBranch( *SplitBB, Invariants, Direction, *ClonedPH, *LoopPH, @@ -2820,9 +2832,14 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, MSSAU->getMemorySSA()->verifyMemorySSA(); DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager); - Instruction *DeoptBlockTerm = - SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true, - GI->getMetadata(LLVMContext::MD_prof), &DTU, &LI); + // llvm.experimental.guard doesn't have branch weights. We can assume, + // however, that the deopt path is unlikely. + Instruction *DeoptBlockTerm = SplitBlockAndInsertIfThen( + GI->getArgOperand(0), GI, true, + !ProfcheckDisableMetadataFixes && EstimateProfile + ? MDBuilder(GI->getContext()).createUnlikelyBranchWeights() + : nullptr, + &DTU, &LI); BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator()); // SplitBlockAndInsertIfThen inserts control flow that branches to // DeoptBlockTerm if the condition is true. We want the opposite. @@ -3186,10 +3203,15 @@ injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L, Builder.SetInsertPoint(TI); auto *InvariantBr = Builder.CreateCondBr(InjectedCond, InLoopSucc, CheckBlock); + // We don't know anything about the relation between the limits. + setExplicitlyUnknownBranchWeightsIfProfiled( + *InvariantBr, *InvariantBr->getParent()->getParent(), DEBUG_TYPE); Builder.SetInsertPoint(CheckBlock); - Builder.CreateCondBr(TI->getCondition(), TI->getSuccessor(0), - TI->getSuccessor(1)); + Builder.CreateCondBr( + TI->getCondition(), TI->getSuccessor(0), TI->getSuccessor(1), + !ProfcheckDisableMetadataFixes ? TI->getMetadata(LLVMContext::MD_prof) + : nullptr); TI->eraseFromParent(); // Fixup phis. diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 5f6f66a4bc213..0a8f5ea2fdae1 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -558,11 +558,10 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) { } else { // Test for successors as back edge BasicBlock *BB = N->getNodeAs<BasicBlock>(); - BranchInst *Term = cast<BranchInst>(BB->getTerminator()); - - for (BasicBlock *Succ : Term->successors()) - if (Visited.count(Succ)) - Loops[Succ] = BB; + if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator())) + for (BasicBlock *Succ : Term->successors()) + if (Visited.count(Succ)) + Loops[Succ] = BB; } } @@ -594,7 +593,7 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) { for (BasicBlock *P : predecessors(BB)) { // Ignore it if it's a branch from outside into our region entry - if (!ParentRegion->contains(P)) + if (!ParentRegion->contains(P) || !dyn_cast<BranchInst>(P->getTerminator())) continue; Region *R = RI->getRegionFor(P); @@ -1402,13 +1401,17 @@ bool StructurizeCFG::makeUniformRegion(Region *R, UniformityInfo &UA) { /// Run the transformation for each region found bool StructurizeCFG::run(Region *R, DominatorTree *DT, const TargetTransformInfo *TTI) { - if (R->isTopLevelRegion()) + // CallBr and its corresponding direct target blocks are for now ignored by + // this pass. This is not a limitation for the currently intended uses cases + // of callbr in the AMDGPU backend. + // Parent and child regions are not affected by this (current) restriction. 
+ // See `llvm/test/Transforms/StructurizeCFG/callbr.ll` for details. + if (R->isTopLevelRegion() || isa<CallBrInst>(R->getEntry()->getTerminator())) return false; this->DT = DT; this->TTI = TTI; Func = R->getEntry()->getParent(); - assert(hasOnlySimpleTerminator(*Func) && "Unsupported block terminator."); ParentRegion = R; diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index 5ba6f95f5fae8..608661583c3db 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -933,6 +933,7 @@ Function *CodeExtractor::constructFunctionDeclaration( case Attribute::CoroDestroyOnlyWhenComplete: case Attribute::CoroElideSafe: case Attribute::NoDivergenceSource: + case Attribute::NoCreateUndefOrPoison: continue; // Those attributes should be safe to propagate to the extracted function. case Attribute::AlwaysInline: diff --git a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp index 0642d51cd2c21..6d4436b92c119 100644 --- a/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp +++ b/llvm/lib/Transforms/Utils/DeclareRuntimeLibcalls.cpp @@ -16,22 +16,62 @@ using namespace llvm; +static void mergeAttributes(LLVMContext &Ctx, const Module &M, + const DataLayout &DL, const Triple &TT, + Function *Func, FunctionType *FuncTy, + AttributeList FuncAttrs) { + AttributeList OldAttrs = Func->getAttributes(); + AttributeList NewAttrs = OldAttrs; + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getFnAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getFnAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addFnAttributes(Ctx, OldBuilder); + } + + { + AttrBuilder OldBuilder(Ctx, OldAttrs.getRetAttrs()); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getRetAttrs()); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addRetAttributes(Ctx, OldBuilder); + } + + for (unsigned I = 0, E = FuncTy->getNumParams(); I != E; ++I) { + AttrBuilder OldBuilder(Ctx, OldAttrs.getParamAttrs(I)); + AttrBuilder NewBuilder(Ctx, FuncAttrs.getParamAttrs(I)); + OldBuilder.merge(NewBuilder); + NewAttrs = NewAttrs.addParamAttributes(Ctx, I, OldBuilder); + } + + Func->setAttributes(NewAttrs); +} + PreservedAnalyses DeclareRuntimeLibcallsPass::run(Module &M, ModuleAnalysisManager &MAM) { RTLIB::RuntimeLibcallsInfo RTLCI(M.getTargetTriple()); LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + const Triple &TT = M.getTargetTriple(); for (RTLIB::LibcallImpl Impl : RTLCI.getLibcallImpls()) { if (Impl == RTLIB::Unsupported) continue; - // TODO: Declare with correct type, calling convention, and attributes. + auto [FuncTy, FuncAttrs] = RTLCI.getFunctionTy(Ctx, TT, DL, Impl); - FunctionType *FuncTy = - FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); + // TODO: Declare with correct type, calling convention, and attributes. 
+ if (!FuncTy) + FuncTy = FunctionType::get(Type::getVoidTy(Ctx), {}, /*IsVarArgs=*/true); StringRef FuncName = RTLCI.getLibcallImplName(Impl); - M.getOrInsertFunction(FuncName, FuncTy); + + Function *Func = + cast<Function>(M.getOrInsertFunction(FuncName, FuncTy).getCallee()); + if (Func->getFunctionType() == FuncTy) { + mergeAttributes(Ctx, M, DL, TT, Func, FuncTy, FuncAttrs); + Func->setCallingConv(RTLCI.getLibcallImplCallingConv(Impl)); + } } return PreservedAnalyses::none(); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 46f29030ddb05..a03cf6e953e35 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3416,7 +3416,11 @@ DIExpression *llvm::getExpressionForConstant(DIBuilder &DIB, const Constant &C, // Create integer constant expression. auto createIntegerExpression = [&DIB](const Constant &CV) -> DIExpression * { const APInt &API = cast<ConstantInt>(&CV)->getValue(); - std::optional<int64_t> InitIntOpt = API.trySExtValue(); + std::optional<int64_t> InitIntOpt; + if (API.getBitWidth() == 1) + InitIntOpt = API.tryZExtValue(); + else + InitIntOpt = API.trySExtValue(); return InitIntOpt ? DIB.createConstantValueExpression( static_cast<uint64_t>(*InitIntOpt)) : nullptr; diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 1e8f6cc76900c..6c9467bf4a005 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -202,6 +202,27 @@ static void ConnectProlog(Loop *L, Value *BECount, unsigned Count, /// probability of executing at least one more iteration? static BranchProbability probOfNextInRemainder(BranchProbability OriginalLoopProb, unsigned N) { + // OriginalLoopProb == 1 would produce a division by zero in the calculation + // below. The problem is that case indicates an always infinite loop, but a + // remainder loop cannot be calculated at run time if the original loop is + // infinite as infinity % UnrollCount is undefined. We then choose + // probabilities indicating that all remainder loop iterations will always + // execute. + // + // Currently, the remainder loop here is an epilogue, which cannot be reached + // if the original loop is infinite, so the aforementioned choice is + // arbitrary. + // + // FIXME: Branch weights still need to be fixed in the case of prologues + // (issue #135812). In that case, the aforementioned choice seems reasonable + // for the goal of maintaining the original loop's block frequencies. That + // is, an infinite loop's initial iterations are not skipped, and the prologue + // loop body might have unique blocks that execute a finite number of times + // if, for example, the original loop body contains conditionals like i < + // UnrollCount. + if (OriginalLoopProb == BranchProbability::getOne()) + return BranchProbability::getOne(); + // Each of these variables holds the original loop's probability that the // number of iterations it will execute is some m in the specified range. 
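The getExpressionForConstant change above works around a genuine sign-extension pitfall: a 1-bit APInt holding `true` sign-extends to -1, so the emitted debug-info constant for an `i1` would be wrong, while zero-extension gives the expected 1. The effect can be reproduced with plain integer arithmetic (a sketch of the behaviour, not the LLVM APInt code):

```cpp
// Why the 1-bit special case matters: sign-extending i1 "true" yields -1,
// zero-extending yields 1, which is the value the DIExpression should carry.
#include <cassert>
#include <cstdint>

int64_t sextFromWidth(uint64_t V, unsigned Bits) { // models trySExtValue()
  return int64_t(V << (64 - Bits)) >> (64 - Bits);
}
uint64_t zextFromWidth(uint64_t V, unsigned Bits) { // models tryZExtValue()
  return Bits == 64 ? V : (V & ((1ULL << Bits) - 1));
}

int main() {
  assert(sextFromWidth(1, 1) == -1); // i1 true, sign-extended: wrong constant
  assert(zextFromWidth(1, 1) == 1);  // i1 true, zero-extended: expected value
  assert(sextFromWidth(1, 8) == 1);  // wider integers are unaffected
  return 0;
}
```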
BranchProbability ProbOne = OriginalLoopProb; // 1 <= m diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 8be471bee5579..6e60b94be78e3 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -992,9 +992,12 @@ BranchProbability llvm::getBranchProbability(BranchInst *B, uint64_t Weight0, Weight1; if (!extractBranchWeights(*B, Weight0, Weight1)) return BranchProbability::getUnknown(); + uint64_t Denominator = Weight0 + Weight1; + if (Denominator == 0) + return BranchProbability::getUnknown(); if (!ForFirstTarget) std::swap(Weight0, Weight1); - return BranchProbability::getBranchProbability(Weight0, Weight0 + Weight1); + return BranchProbability::getBranchProbability(Weight0, Denominator); } bool llvm::setBranchProbability(BranchInst *B, BranchProbability P, diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index cbc604e87cf1a..3a3e3ade20212 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -778,8 +778,10 @@ struct ConstantComparesGatherer { return false; // Add all values from the range to the set - for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp) + APInt Tmp = Span.getLower(); + do Vals.push_back(ConstantInt::get(I->getContext(), Tmp)); + while (++Tmp != Span.getUpper()); UsedICmps++; return true; @@ -6020,6 +6022,8 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const DataLayout &DL) { Value *Cond = SI->getCondition(); KnownBits Known = computeKnownBits(Cond, DL, AC, SI); + SmallPtrSet<const Constant *, 4> KnownValues; + bool IsKnownValuesValid = collectPossibleValues(Cond, KnownValues, 4); // We can also eliminate cases by determining that their values are outside of // the limited range of the condition based on how many significant (non-sign) @@ -6039,15 +6043,18 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, UniqueSuccessors.push_back(Successor); ++It->second; } - const APInt &CaseVal = Case.getCaseValue()->getValue(); + ConstantInt *CaseC = Case.getCaseValue(); + const APInt &CaseVal = CaseC->getValue(); if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) || - (CaseVal.getSignificantBits() > MaxSignificantBitsInCond)) { - DeadCases.push_back(Case.getCaseValue()); + (CaseVal.getSignificantBits() > MaxSignificantBitsInCond) || + (IsKnownValuesValid && !KnownValues.contains(CaseC))) { + DeadCases.push_back(CaseC); if (DTU) --NumPerSuccessorCases[Successor]; LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal << " is dead.\n"); - } + } else if (IsKnownValuesValid) + KnownValues.erase(CaseC); } // If we can prove that the cases must cover all possible values, the @@ -6058,33 +6065,41 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU, const unsigned NumUnknownBits = Known.getBitWidth() - (Known.Zero | Known.One).popcount(); assert(NumUnknownBits <= Known.getBitWidth()); - if (HasDefault && DeadCases.empty() && - NumUnknownBits < 64 /* avoid overflow */) { - uint64_t AllNumCases = 1ULL << NumUnknownBits; - if (SI->getNumCases() == AllNumCases) { + if (HasDefault && DeadCases.empty()) { + if (IsKnownValuesValid && all_of(KnownValues, IsaPred<UndefValue>)) { createUnreachableSwitchDefault(SI, DTU); return true; } - // When only one case value is missing, replace default with that case. 
- // Eliminating the default branch will provide more opportunities for - // optimization, such as lookup tables. - if (SI->getNumCases() == AllNumCases - 1) { - assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); - IntegerType *CondTy = cast<IntegerType>(Cond->getType()); - if (CondTy->getIntegerBitWidth() > 64 || - !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) - return false; - uint64_t MissingCaseVal = 0; - for (const auto &Case : SI->cases()) - MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); - auto *MissingCase = - cast<ConstantInt>(ConstantInt::get(Cond->getType(), MissingCaseVal)); - SwitchInstProfUpdateWrapper SIW(*SI); - SIW.addCase(MissingCase, SI->getDefaultDest(), SIW.getSuccessorWeight(0)); - createUnreachableSwitchDefault(SI, DTU, /*RemoveOrigDefaultBlock*/ false); - SIW.setSuccessorWeight(0, 0); - return true; + if (NumUnknownBits < 64 /* avoid overflow */) { + uint64_t AllNumCases = 1ULL << NumUnknownBits; + if (SI->getNumCases() == AllNumCases) { + createUnreachableSwitchDefault(SI, DTU); + return true; + } + // When only one case value is missing, replace default with that case. + // Eliminating the default branch will provide more opportunities for + // optimization, such as lookup tables. + if (SI->getNumCases() == AllNumCases - 1) { + assert(NumUnknownBits > 1 && "Should be canonicalized to a branch"); + IntegerType *CondTy = cast<IntegerType>(Cond->getType()); + if (CondTy->getIntegerBitWidth() > 64 || + !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth())) + return false; + + uint64_t MissingCaseVal = 0; + for (const auto &Case : SI->cases()) + MissingCaseVal ^= Case.getCaseValue()->getValue().getLimitedValue(); + auto *MissingCase = cast<ConstantInt>( + ConstantInt::get(Cond->getType(), MissingCaseVal)); + SwitchInstProfUpdateWrapper SIW(*SI); + SIW.addCase(MissingCase, SI->getDefaultDest(), + SIW.getSuccessorWeight(0)); + createUnreachableSwitchDefault(SI, DTU, + /*RemoveOrigDefaultBlock*/ false); + SIW.setSuccessorWeight(0, 0); + return true; + } } } @@ -7570,6 +7585,81 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder, return true; } +/// Tries to transform the switch when the condition is umin with a constant. +/// In that case, the default branch can be replaced by the constant's branch. +/// This method also removes dead cases when the simplification cannot replace +/// the default branch. +/// +/// For example: +/// switch(umin(a, 3)) { +/// case 0: +/// case 1: +/// case 2: +/// case 3: +/// case 4: +/// // ... +/// default: +/// unreachable +/// } +/// +/// Transforms into: +/// +/// switch(a) { +/// case 0: +/// case 1: +/// case 2: +/// default: +/// // This is case 3 +/// } +static bool simplifySwitchWhenUMin(SwitchInst *SI, DomTreeUpdater *DTU) { + Value *A; + ConstantInt *Constant; + + if (!match(SI->getCondition(), m_UMin(m_Value(A), m_ConstantInt(Constant)))) + return false; + + SmallVector<DominatorTree::UpdateType> Updates; + SwitchInstProfUpdateWrapper SIW(*SI); + BasicBlock *BB = SIW->getParent(); + + // Dead cases are removed even when the simplification fails. + // A case is dead when its value is higher than the Constant. 
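A source-level view of what simplifySwitchWhenUMin targets: when the switch key is clamped with an unsigned min and the default is unreachable, every case above the clamp constant is dead, and the clamp constant's case can absorb the default so the clamp itself disappears. Illustration only, assuming the default truly cannot be reached (__builtin_unreachable stands in for the unreachable default):

```cpp
#include <algorithm>

// Before: the key is umin(A, 3), so case 4 is dead and, once cases 0..3
// exist, the default can never be taken.
int before(unsigned A) {
  switch (std::min(A, 3u)) {
  case 0: return 10;
  case 1: return 11;
  case 2: return 12;
  case 3: return 13;
  case 4: return 14;            // dead: umin(A, 3) is never 4
  default: __builtin_unreachable();
  }
}

// After the transform: switch directly on A; the old case 3 becomes the
// default and now covers every value >= 3.
int after(unsigned A) {
  switch (A) {
  case 0: return 10;
  case 1: return 11;
  case 2: return 12;
  default: return 13;           // was case 3
  }
}

int main() { return before(5) == after(5) ? 0 : 1; }
```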
+ for (auto I = SI->case_begin(), E = SI->case_end(); I != E;) { + if (!I->getCaseValue()->getValue().ugt(Constant->getValue())) { + ++I; + continue; + } + BasicBlock *DeadCaseBB = I->getCaseSuccessor(); + DeadCaseBB->removePredecessor(BB); + Updates.push_back({DominatorTree::Delete, BB, DeadCaseBB}); + I = SIW->removeCase(I); + E = SIW->case_end(); + } + + auto Case = SI->findCaseValue(Constant); + // If the case value is not found, `findCaseValue` returns the default case. + // In this scenario, since there is no explicit `case 3:`, the simplification + // fails. The simplification also fails when the switch’s default destination + // is reachable. + if (!SI->defaultDestUnreachable() || Case == SI->case_default()) { + if (DTU) + DTU->applyUpdates(Updates); + return !Updates.empty(); + } + + BasicBlock *Unreachable = SI->getDefaultDest(); + SIW.replaceDefaultDest(Case); + SIW.removeCase(Case); + SIW->setCondition(A); + + Updates.push_back({DominatorTree::Delete, BB, Unreachable}); + + if (DTU) + DTU->applyUpdates(Updates); + + return true; +} + /// Tries to transform switch of powers of two to reduce switch range. /// For example, switch like: /// switch (C) { case 1: case 2: case 64: case 128: } @@ -8037,6 +8127,9 @@ bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) { if (simplifyDuplicateSwitchArms(SI, DTU)) return requestResimplify(); + if (simplifySwitchWhenUMin(SI, DTU)) + return requestResimplify(); + return false; } diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp index 94c5c1709f43e..e86ab13094b15 100644 --- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp +++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp @@ -158,6 +158,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { SmallVector<BasicBlock *, 8> CallBrTargetBlocksToFix; // Redirect exiting edges through a control flow hub. ControlFlowHub CHub; + bool Changed = false; for (unsigned I = 0; I < ExitingBlocks.size(); ++I) { BasicBlock *BB = ExitingBlocks[I]; @@ -182,6 +183,10 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { bool UpdatedLI = false; BasicBlock *NewSucc = SplitCallBrEdge(BB, Succ, J, &DTU, nullptr, &LI, &UpdatedLI); + // SplitCallBrEdge modifies the CFG because it creates an intermediate + // block. So we need to set the changed flag no matter what the + // ControlFlowHub is going to do later. + Changed = true; // Even if CallBr and Succ do not have a common parent loop, we need to // add the new target block to the parent loop of the current loop. 
if (!UpdatedLI) @@ -207,6 +212,7 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) { bool ChangedCFG; std::tie(LoopExitBlock, ChangedCFG) = CHub.finalize( &DTU, GuardBlocks, "loop.exit", MaxBooleansInControlFlowHub.getValue()); + ChangedCFG |= Changed; if (!ChangedCFG) return false; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp index fdfff16132093..03112c67dda7b 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -462,8 +462,9 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy, bool CanAddPredicate = !llvm::shouldOptimizeForSize( TheLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass); - int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides, - CanAddPredicate, false).value_or(0); + int Stride = getPtrStride(PSE, AccessTy, Ptr, TheLoop, *DT, Strides, + CanAddPredicate, false) + .value_or(0); if (Stride == 1 || Stride == -1) return Stride; return 0; diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 34b405ced8c0a..bf3f52c51b64c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -20975,6 +20975,27 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, if (isa<PHINode>(S.getMainOp()) || isVectorLikeInstWithConstOps(S.getMainOp())) return nullptr; + // If the parent node is non-schedulable and the current node is copyable, and + // any of parent instructions are used outside several basic blocks or in + // bin-op node - cancel scheduling, it may cause wrong def-use deps in + // analysis, leading to a crash. + // Non-scheduled nodes may not have related ScheduleData model, which may lead + // to a skipped dep analysis. 
+ if (S.areInstructionsWithCopyableElements() && EI && EI.UserTE->hasState() && + EI.UserTE->doesNotNeedToSchedule() && + EI.UserTE->getOpcode() != Instruction::PHI && + any_of(EI.UserTE->Scalars, [](Value *V) { + auto *I = dyn_cast<Instruction>(V); + if (!I || I->hasOneUser()) + return false; + for (User *U : I->users()) { + auto *UI = cast<Instruction>(U); + if (isa<BinaryOperator>(UI)) + return true; + } + return false; + })) + return std::nullopt; bool HasCopyables = S.areInstructionsWithCopyableElements(); if (((!HasCopyables && doesNotNeedToSchedule(VL)) || all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); }))) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index e1da070a1fb7f..cfe1f1e9d7528 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1110,9 +1110,8 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, VP_CLASSOF_IMPL(VPDef::VPInstructionSC) VPInstruction *clone() override { - SmallVector<VPValue *, 2> Operands(operands()); - auto *New = - new VPInstruction(Opcode, Operands, *this, *this, getDebugLoc(), Name); + auto *New = new VPInstruction(Opcode, operands(), *this, *this, + getDebugLoc(), Name); if (getUnderlyingValue()) New->setUnderlyingValue(getUnderlyingInstr()); return New; @@ -1226,10 +1225,9 @@ class VPInstructionWithType : public VPInstruction { } VPInstruction *clone() override { - SmallVector<VPValue *, 2> Operands(operands()); auto *New = - new VPInstructionWithType(getOpcode(), Operands, getResultType(), *this, - getDebugLoc(), getName()); + new VPInstructionWithType(getOpcode(), operands(), getResultType(), + *this, getDebugLoc(), getName()); New->setUnderlyingValue(getUnderlyingValue()); return New; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 9d9bb14530539..2588c878d8472 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -154,27 +154,32 @@ static bool sinkScalarOperands(VPlan &Plan) { bool ScalarVFOnly = Plan.hasScalarVFOnly(); bool Changed = false; - auto IsValidSinkCandidate = [ScalarVFOnly](VPBasicBlock *SinkTo, - VPSingleDefRecipe *Candidate) { - // We only know how to duplicate VPReplicateRecipes and - // VPScalarIVStepsRecipes for now. + SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; + auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList]( + VPBasicBlock *SinkTo, VPValue *Op) { + auto *Candidate = + dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe()); + if (!Candidate) + return; + + // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes + // for now. if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate)) - return false; + return; if (Candidate->getParent() == SinkTo || Candidate->mayHaveSideEffects() || Candidate->mayReadOrWriteMemory()) - return false; + return; if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate)) if (!ScalarVFOnly && RepR->isSingleScalar()) - return false; + return; - return true; + WorkList.insert({SinkTo, Candidate}); }; // First, collect the operands of all recipes in replicate blocks as seeds for // sinking. 
- SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList; for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) { VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock(); if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2) @@ -182,14 +187,9 @@ static bool sinkScalarOperands(VPlan &Plan) { VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front()); if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock()) continue; - for (auto &Recipe : *VPBB) { - for (VPValue *Op : Recipe.operands()) { - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - if (IsValidSinkCandidate(VPBB, Def)) - WorkList.insert({VPBB, Def}); - } - } + for (auto &Recipe : *VPBB) + for (VPValue *Op : Recipe.operands()) + InsertIfValidSinkCandidate(VPBB, Op); } // Try to sink each replicate or scalar IV steps recipe in the worklist. @@ -198,8 +198,8 @@ static bool sinkScalarOperands(VPlan &Plan) { VPSingleDefRecipe *SinkCandidate; std::tie(SinkTo, SinkCandidate) = WorkList[I]; - // All recipe users of the sink candidate must be in the same block SinkTo - // or all users outside of SinkTo must have only their first lane used. In + // All recipe users of SinkCandidate must be in the same block SinkTo or all + // users outside of SinkTo must only use the first lane of SinkCandidate. In // the latter case, we need to duplicate SinkCandidate. auto UsersOutsideSinkTo = make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) { @@ -234,10 +234,7 @@ static bool sinkScalarOperands(VPlan &Plan) { } SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi()); for (VPValue *Op : SinkCandidate->operands()) - if (auto *Def = - dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe())) - if (IsValidSinkCandidate(SinkTo, Def)) - WorkList.insert({SinkTo, Def}); + InsertIfValidSinkCandidate(SinkTo, Op); Changed = true; } return Changed; diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 91734a10cb2c8..34754a1ea3992 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -252,6 +252,13 @@ bool VPlanVerifier::verifyVPBasicBlock(const VPBasicBlock *VPBB) { for (const VPUser *U : V->users()) { auto *UI = cast<VPRecipeBase>(U); + if (isa<VPIRPhi>(UI) && + UI->getNumOperands() != UI->getParent()->getNumPredecessors()) { + errs() << "Phi-like recipe with different number of operands and " + "predecessors.\n"; + return false; + } + if (auto *Phi = dyn_cast<VPPhiAccessors>(UI)) { for (const auto &[IncomingVPV, IncomingVPBB] : Phi->incoming_values_and_blocks()) { diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index d6eb00da11dc8..27a8bbd5776be 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -2017,8 +2017,31 @@ bool VectorCombine::scalarizeExtExtract(Instruction &I) { Value *ScalarV = Ext->getOperand(0); if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV), - &DT)) - ScalarV = Builder.CreateFreeze(ScalarV); + &DT)) { + // Check wether all lanes are extracted, all extracts trigger UB + // on poison, and the last extract (and hence all previous ones) + // are guaranteed to execute if Ext executes. If so, we do not + // need to insert a freeze. 
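The comment above describes when the freeze can be skipped; the sketch below is a hand-written illustration of that shape of input. It is not taken from the patch or its tests, and whether the freeze is actually elided for it depends on programUndefinedIfPoison and the surrounding checks: both lanes of the extended vector are extracted in the same block, and each extracted value feeds an address computation whose load would make the program undefined if the value were poison.

define i32 @all_lanes_used(<2 x i8> %v, ptr %base) {
entry:
  %ext = zext <2 x i8> %v to <2 x i32>
  %e0 = extractelement <2 x i32> %ext, i64 0
  %e1 = extractelement <2 x i32> %ext, i64 1
  ; Poison in either lane would reach a load address, which is undefined
  ; behavior, so a freeze of the scalarized source would add nothing here.
  %p0 = getelementptr inbounds i32, ptr %base, i32 %e0
  %p1 = getelementptr inbounds i32, ptr %base, i32 %e1
  %l0 = load i32, ptr %p0, align 4
  %l1 = load i32, ptr %p1, align 4
  %sum = add i32 %l0, %l1
  ret i32 %sum
}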
+ SmallDenseSet<ConstantInt *, 8> ExtractedLanes; + bool AllExtractsTriggerUB = true; + ExtractElementInst *LastExtract = nullptr; + BasicBlock *ExtBB = Ext->getParent(); + for (User *U : Ext->users()) { + auto *Extract = cast<ExtractElementInst>(U); + if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) { + AllExtractsTriggerUB = false; + break; + } + ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand())); + if (!LastExtract || LastExtract->comesBefore(Extract)) + LastExtract = Extract; + } + if (ExtractedLanes.size() != DstTy->getNumElements() || + !AllExtractsTriggerUB || + !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(), + LastExtract->getIterator())) + ScalarV = Builder.CreateFreeze(ScalarV); + } ScalarV = Builder.CreateBitCast( ScalarV, IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy))); diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll b/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll index 7e1588f427be4..76f73e43a2355 100644 --- a/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll +++ b/llvm/test/Analysis/CostModel/AArch64/arith-widening.ll @@ -325,14 +325,14 @@ define void @extaddv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = add <4 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <4 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <4 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <4 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <4 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <4 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <4 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <4 x i32> %i32 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = add <4 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: 
%sl1_32_64 = sext <4 x i32> %i32 to <4 x i64> @@ -434,24 +434,24 @@ define void @extaddv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = add <8 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = add <8 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <8 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <8 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = add <8 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <8 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <8 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = add <8 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <8 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <8 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = add <8 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> -; 
CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <8 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <8 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = add <8 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32> @@ -464,14 +464,14 @@ define void @extaddv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = add <8 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <8 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <8 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <8 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <8 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <8 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <8 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = add <8 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64> @@ -573,24 +573,24 @@ define void @extaddv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = add <16 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> 
%i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = add <16 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <16 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = add <16 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = add <16 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <16 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = add <16 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = add <16 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <16 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = add <16 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = add <16 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <16 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> 
%i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = add <16 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = add <16 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32> @@ -603,14 +603,14 @@ define void @extaddv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = add <16 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = add <16 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <16 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = add <16 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = add <16 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <16 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = add <16 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = add <16 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64> @@ -1020,14 +1020,14 @@ define void @extsubv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = sub <4 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <4 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> -; 
CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <4 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <4 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <4 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <4 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <4 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <4 x i32> %i32 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <4 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64> @@ -1129,24 +1129,24 @@ define void @extsubv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = sub <8 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = sub <8 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <8 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <8 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = sub <8 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 
SizeLat:1 for: %azl_8_32 = sub <8 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <8 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = sub <8 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <8 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <8 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = sub <8 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <8 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <8 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = sub <8 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32> @@ -1159,14 +1159,14 @@ define void @extsubv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = sub <8 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <8 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <8 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost 
Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <8 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <8 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <8 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <8 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <8 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64> @@ -1268,24 +1268,24 @@ define void @extsubv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = sub <16 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = sub <16 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <16 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = sub <16 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = sub <16 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = sub <16 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 
SizeLat:1 for: %azl_8_32 = sub <16 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = sub <16 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <16 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = sub <16 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = sub <16 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <16 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = sub <16 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = sub <16 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32> @@ -1298,14 +1298,14 @@ define void @extsubv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = sub <16 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = sub <16 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <16 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = sub <16 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 
CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = sub <16 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <16 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = sub <16 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = sub <16 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64> @@ -1715,14 +1715,14 @@ define void @extmulv4(<4 x i8> %i8, <4 x i16> %i16, <4 x i32> %i32, <4 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_16_32 = mul <4 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <4 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <4 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <4 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <4 x i16> %i16 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <4 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:28 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <4 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <4 x i16> %i16 to <4 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <4 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <4 x i32> %i32 to <4 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:28 
CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <4 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <4 x i32> %i32 to <4 x i64> @@ -1824,24 +1824,24 @@ define void @extmulv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of 1 for: %azl_8_16 = mul <8 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = mul <8 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <8 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <8 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <8 x i8> %i8 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = mul <8 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <8 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <8 x i8> %i8 to <8 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <8 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = mul <8 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <8 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <8 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <8 x i8> %i8 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = mul <8 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> 
-; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <8 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <8 x i8> %i8 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:7 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <8 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_32 = sext <8 x i16> %i16 to <8 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = mul <8 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <8 x i16> %i16 to <8 x i32> @@ -1854,14 +1854,14 @@ define void @extmulv8(<8 x i8> %i8, <8 x i16> %i16, <8 x i32> %i32, <8 x i64> %i ; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = mul <8 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <8 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <8 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <8 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <8 x i16> %i16 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <8 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <8 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <8 x i16> %i16 to <8 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <8 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <8 x i32> %i32 to <8 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:56 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <8 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <8 x i32> %i32 to <8 x i64> @@ -1963,24 +1963,24 @@ define void @extmulv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found 
costs of RThru:2 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_16 = mul <16 x i16> %zl1_8_16, %zl2_8_16 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_32 = sext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_32 = mul <16 x i32> %i32, %sw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <16 x i32> %sl1_8_32, %sl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_32 = sext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_32 = mul <16 x i32> %sl1_8_32, %sl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_32 = zext <16 x i8> %i8 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_32 = mul <16 x i32> %i32, %zw_8_32 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> -; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <16 x i32> %zl1_8_32, %zl2_8_32 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_32 = zext <16 x i8> %i8 to <16 x i32> +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_32 = mul <16 x i32> %zl1_8_32, %zl2_8_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sw_8_64 = sext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_8_64 = mul <16 x i64> %i64, %sw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <16 x i64> %sl1_8_64, %sl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_8_64 = sext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %asl_8_64 = mul <16 x i64> %sl1_8_64, %sl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zw_8_64 = zext <16 x i8> %i8 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azw_8_64 = mul <16 x i64> %i64, %zw_8_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:15 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: 
%azl_8_64 = mul <16 x i64> %zl1_8_64, %zl2_8_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_8_64 = zext <16 x i8> %i8 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:14 CodeSize:1 Lat:1 SizeLat:1 for: %azl_8_64 = mul <16 x i64> %zl1_8_64, %zl2_8_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_32 = sext <16 x i16> %i16 to <16 x i32> ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_32 = mul <16 x i32> %i32, %sw_16_32 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_32 = sext <16 x i16> %i16 to <16 x i32> @@ -1993,14 +1993,14 @@ define void @extmulv16(<16 x i8> %i8, <16 x i16> %i16, <16 x i32> %i32, <16 x i6 ; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_32 = mul <16 x i32> %zl1_16_32, %zl2_16_32 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sw_16_64 = sext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_16_64 = mul <16 x i64> %i64, %sw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <16 x i64> %sl1_16_64, %sl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl2_16_64 = sext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %asl_16_64 = mul <16 x i64> %sl1_16_64, %sl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zw_16_64 = zext <16 x i16> %i16 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azw_16_64 = mul <16 x i64> %i64, %zw_16_64 -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> -; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <16 x i64> %zl1_16_64, %zl2_16_64 +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl1_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of 0 for: %zl2_16_64 = zext <16 x i16> %i16 to <16 x i64> +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:1 SizeLat:1 for: %azl_16_64 = mul <16 x i64> %zl1_16_64, %zl2_16_64 ; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:1 SizeLat:1 for: %sw_32_64 = sext <16 x i32> %i32 to <16 x i64> ; CHECK-NEXT: Cost Model: Found costs of RThru:112 CodeSize:1 Lat:1 SizeLat:1 for: %asw_32_64 = mul <16 x i64> %i64, %sw_32_64 ; CHECK-NEXT: Cost Model: Found costs of 0 for: %sl1_32_64 = sext <16 x i32> %i32 to <16 x i64> diff --git a/llvm/test/Analysis/CostModel/AArch64/sincos.ll b/llvm/test/Analysis/CostModel/AArch64/sincos.ll index 32408acb582d0..48537f6012dd5 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sincos.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sincos.ll @@ -1,6 +1,7 @@ ; NOTE: 
Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --filter "sincos" ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s ; RUN: opt < %s -mtriple=aarch64-gnu-linux -mattr=+neon,+sve -vector-library=ArmPL -passes="print<cost-model>" -intrinsic-cost-strategy=intrinsic-cost -cost-kind=throughput 2>&1 -disable-output | FileCheck %s -check-prefix=CHECK-VECLIB +; RUN: opt < %s -mtriple=arm64-apple-macos10.9 -mattr=+neon -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefix=SINCOS_STRET %s define void @sincos() { ; CHECK-LABEL: 'sincos' @@ -8,13 +9,11 @@ define void @sincos() { ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) -; ; CHECK: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) ; CHECK: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) ; CHECK: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) ; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) ; CHECK: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) -; ; CHECK: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) ; CHECK: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) ; CHECK: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) @@ -26,18 +25,32 @@ define void @sincos() { ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) -; ; CHECK-VECLIB: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 104 for instruction: 
%v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) -; ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) ; CHECK-VECLIB: Cost Model: Found an estimated cost of 13 for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison) ; CHECK-VECLIB: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison) +; +; SINCOS_STRET-LABEL: 'sincos' +; SINCOS_STRET: Cost Model: Found an estimated cost of 1 for instruction: %f16 = call { half, half } @llvm.sincos.f16(half poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f32 = call { float, float } @llvm.sincos.f32(float poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f64 = call { double, double } @llvm.sincos.f64(double poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %f128 = call { fp128, fp128 } @llvm.sincos.f128(fp128 poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 36 for instruction: %v8f16 = call { <8 x half>, <8 x half> } @llvm.sincos.v8f16(<8 x half> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 52 for instruction: %v4f32 = call { <4 x float>, <4 x float> } @llvm.sincos.v4f32(<4 x float> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 24 for instruction: %v2f64 = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 10 for instruction: %v1f128 = call { <1 x fp128>, <1 x fp128> } @llvm.sincos.v1f128(<1 x fp128> poison) +; SINCOS_STRET: Cost Model: Found an estimated cost of 104 for instruction: %v8f32 = call { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv8f16 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.sincos.nxv8f16(<vscale x 8 x half> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv4f32 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.sincos.nxv4f32(<vscale x 4 x float> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv2f64 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.sincos.nxv2f64(<vscale x 2 x double> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv1f128 = call { <vscale x 1 x fp128>, <vscale x 1 x fp128> } @llvm.sincos.nxv1f128(<vscale x 1 x fp128> poison) +; SINCOS_STRET: Cost Model: Invalid cost for instruction: %nxv8f32 = call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.sincos.nxv8f32(<vscale x 8 x float> poison) ; %f16 = call { half, half } @llvm.sincos.f16(half poison) %f32 = call { float, float } @llvm.sincos.f32(float poison) diff --git a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll index 4c2a9c3f29f02..d90a97f1651e6 100644 --- 
a/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/inbounds-gep-in-predicated-blocks.ll @@ -10,7 +10,7 @@ ; s0 += (1ULL << 62) + 1; ; s1 += (1ULL << 62) + 2; ; } -; FIXME: We cannot use inbounds on idx.0, idx.1 to infer no-wrap (and determine +; We cannot use inbounds on idx.0, idx.1 to infer no-wrap (and determine ; there are no dependences), as the pointers are not dereferenced in all loop iterations. define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) { ; CHECK-LABEL: 'test_inbounds_gep_used_in_predicated_block' @@ -19,9 +19,14 @@ define void @test_inbounds_gep_used_in_predicated_block(ptr %A, i64 %n) { ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. ; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; @@ -63,9 +68,14 @@ define void @test_inbounds_gep_used_in_predicated_block_stored_value_operand(ptr ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were found in loop. ; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; @@ -109,9 +119,14 @@ define void @test_inbounds_gep_used_in_predicated_block_non_memop_user(ptr %A, i ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group GRP0: +; CHECK-NEXT: (Low: %A High: (-4611686018427387705 + %A)) +; CHECK-NEXT: Member: {%A,+,4611686018427387906}<%loop.header> +; CHECK-NEXT: Member: {%A,+,4611686018427387905}<%loop.header> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
; CHECK-NEXT: SCEV assumptions: +; CHECK-NEXT: {%A,+,4611686018427387906}<%loop.header> Added Flags: <nusw> ; CHECK-EMPTY: ; CHECK-NEXT: Expressions re-written: ; diff --git a/llvm/test/Bitcode/attributes.ll b/llvm/test/Bitcode/attributes.ll index aef7810fe2c3b..107a98aebeeb8 100644 --- a/llvm/test/Bitcode/attributes.ll +++ b/llvm/test/Bitcode/attributes.ll @@ -521,6 +521,11 @@ define void @f_sanitize_alloc_token() sanitize_alloc_token { ret void; } +; CHECK: define void @f_no_create_undef_or_poison() #56 +define void @f_no_create_undef_or_poison() nocreateundeforpoison { + ret void; +} + ; CHECK: define void @f87() [[FNRETTHUNKEXTERN:#[0-9]+]] define void @f87() fn_ret_thunk_extern { ret void } @@ -633,6 +638,7 @@ define void @dead_on_return(ptr dead_on_return %p) { ; CHECK: attributes #53 = { sanitize_realtime } ; CHECK: attributes #54 = { sanitize_realtime_blocking } ; CHECK: attributes #55 = { sanitize_alloc_token } +; CHECK: attributes #56 = { nocreateundeforpoison } ; CHECK: attributes [[FNRETTHUNKEXTERN]] = { fn_ret_thunk_extern } ; CHECK: attributes [[SKIPPROFILE]] = { skipprofile } ; CHECK: attributes [[OPTDEBUG]] = { optdebug } diff --git a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll index cad5df0d9655e..68ab8902767b3 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ccmp.ll @@ -430,12 +430,12 @@ declare i32 @foo() ; Test case distilled from 126.gcc. ; The phi in sw.bb.i.i gets multiple operands for the %entry predecessor. -define void @build_modify_expr() nounwind ssp { +define void @build_modify_expr(i32 %cond) nounwind ssp { ; CHECK-LABEL: build_modify_expr: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: ret entry: - switch i32 undef, label %sw.bb.i.i [ + switch i32 %cond, label %sw.bb.i.i [ i32 69, label %if.end85 i32 70, label %if.end85 i32 71, label %if.end85 diff --git a/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll b/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll index bdbc99e2d98b0..75e7ac902274d 100644 --- a/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll +++ b/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll @@ -2,15 +2,58 @@ declare void @called() declare void @escaped() -define void @f(ptr %dst) { +define void @f(ptr %dst, ptr readonly %f) { call void @called() +; CHECK: bl "#called" store ptr @escaped, ptr %dst - ret void + call void %f() +; CHECK: adrp x10, $iexit_thunk$cdecl$v$v +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$v +; CHECK-NEXT: str x8, [x20] +; CHECK-NEXT: adrp x8, __os_arm64x_check_icall_cfg +; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall_cfg] +; CHECK-NEXT: mov x11, +; CHECK-NEXT: blr x8 +; CHECK-NEXT: blr x11 + ret void } +; CHECK-LABEL: .def "#called$exit_thunk"; +; CHECK-NEXT: .scl 2; +; CHECK-NEXT: .type 32; +; CHECK-NEXT: .endef +; CHECK-NEXT: .section .wowthk$aa,"xr",discard,"#called$exit_thunk" +; CHECK-NEXT: .globl "#called$exit_thunk" // -- Begin function #called$exit_thunk +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: "#called$exit_thunk": // @"#called$exit_thunk" +; CHECK-NEXT: .weak_anti_dep called +; CHECK-NEXT: called = "#called" +; CHECK-NEXT: .weak_anti_dep "#called" +; CHECK-NEXT: "#called" = "#called$exit_thunk" +; CHECK-NEXT: .seh_proc "#called$exit_thunk" +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: adrp x8, __os_arm64x_check_icall +; CHECK-NEXT: adrp x11, called +; CHECK-NEXT: add x11, x11, :lo12:called +; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] +; CHECK-NEXT: adrp x10, $iexit_thunk$cdecl$v$v +; CHECK-NEXT: add x10, x10, :lo12:$iexit_thunk$cdecl$v$v +; CHECK-NEXT: blr x8 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x11 +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + !llvm.module.flags = !{!0} -!0 = !{i32 2, !"cfguard", i32 1} +!0 = !{i32 2, !"cfguard", i32 2} ; CHECK-LABEL: .section .gfids$y,"dr" ; CHECK-NEXT: .symidx escaped +; CHECK-NEXT: .symidx $iexit_thunk$cdecl$v$v ; CHECK-NOT: .symidx diff --git a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir index f535e0fe8b387..bb7ffb47d8dfe 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir @@ -162,63 +162,54 @@ body: | RET_ReallyLR # CHECK-LABEL: name: test_allocate_split_sve_realigned -# CHECK: stackSize: 2080 +# CHECK: stackSize: 1056 # CHECK: bb.0.entry: # CHECK: liveins: $z0, $p0, $lr -# CHECK: $sp = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 -# CHECK-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.5) -# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 129 :: (store (s64) into %stack.4) -# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 1024, 0 +# CHECK: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.5), (store (s64) into %stack.4) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0 # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2, implicit $vg -# CHECK-NEXT: $sp = frame-setup ANDXri killed $x9, 7930 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 2064, 0 +# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $x9, -3, implicit $vg +# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]], 7930 # # CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0 # CHECK-NEXT: $x8 = ADDPL_XXI $x8, -1, implicit $vg -# CHECK-NEXT: STR_ZXI $z0, killed $x8, -1 :: (store (<vscale x 1 x s128>) into %stack.0) -# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0 -# CHECK-NEXT: STR_PXI $p0, killed $x8, -15 :: (store (<vscale x 1 x s16>) into %stack.1) +# CHECK-NEXT: STR_ZXI $z0, killed $x8, -2 :: (store (<vscale x 1 x s128>) into %stack.0) +# CHECK-NEXT: STR_PXI $p0, $fp, -6 :: (store (<vscale x 1 x s16>) into %stack.1) # -# CHECK-NEXT: $sp = frame-destroy SUBXri $fp, 1024, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1040 -# CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 129 :: (load (s64) from %stack.4) -# CHECK-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.5) -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 +# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from 
%stack.5), (load (s64) from %stack.4) # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30 # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 # CHECK-NEXT: RET_ReallyLR # ASM-LABEL: test_allocate_split_sve_realigned -# ASM: sub sp, sp, #1040 -# ASM-NEXT: .cfi_def_cfa_offset 1040 -# ASM-NEXT: str x29, [sp, #1024] -# ASM-NEXT: str x30, [sp, #1032] -# ASM-NEXT: add x29, sp, #1024 +# ASM: stp x29, x30, [sp, #-16]! +# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: mov x29, sp # ASM-NEXT: .cfi_def_cfa w29, 16 # ASM-NEXT: .cfi_offset w30, -8 # ASM-NEXT: .cfi_offset w29, -16 # -# ASM: sub sp, x29, #1024 -# ASM-NEXT: .cfi_def_cfa wsp, 1040 -# ASM-NEXT: ldr x30, [sp, #1032] -# ASM-NEXT: ldr x29, [sp, #1024] -# ASM-NEXT: add sp, sp, #1040 +# ASM: mov sp, x29 +# ASM-NEXT: .cfi_def_cfa wsp, 16 +# ASM-NEXT: ldp x29, x30, [sp], #16 # ASM-NEXT: .cfi_def_cfa_offset 0 # ASM-NEXT: .cfi_restore w30 # ASM-NEXT: .cfi_restore w29 -# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 # UNWINDINFO: DW_CFA_def_cfa: reg29 +16 # UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 # -# UNWINDINFO: DW_CFA_def_cfa: reg31 +1040 +# UNWINDINFO: DW_CFA_def_cfa: reg31 +16 # UNWINDINFO: DW_CFA_def_cfa_offset: +0 # UNWINDINFO-NEXT: DW_CFA_restore: reg30 # UNWINDINFO-NEXT: DW_CFA_restore: reg29 diff --git a/llvm/test/CodeGen/AArch64/ldst-implicitop.mir b/llvm/test/CodeGen/AArch64/ldst-implicitop.mir new file mode 100644 index 0000000000000..34e8cf282669c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ldst-implicitop.mir @@ -0,0 +1,80 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=aarch64-- -run-pass=aarch64-ldst-opt -verify-machineinstrs -o - %s | FileCheck %s +# Check that we copy implicit operands. +--- +name: impdef_op1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_op1 + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: RET_ReallyLR + renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + renamable $q20 = LDRQui renamable $lr, 4 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + RET_ReallyLR +... +--- +name: impdef_op2 +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_op2 + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q20, renamable $q5 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: RET_ReallyLR + renamable $q20 = LDRQui renamable $lr, 3 :: (load (s128)) + renamable $q5 = LDRQui renamable $lr, 4, implicit-def $q4_q5 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + RET_ReallyLR +... 
+--- +name: impdef_both +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_both + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5, implicit-def $q20_q21 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: $q2 = ORRv16i8 $q20, killed $q20 + ; CHECK-NEXT: $q3 = ORRv16i8 $q21, killed $q21 + ; CHECK-NEXT: RET_ReallyLR + renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + renamable $q20 = LDRQui renamable $lr, 4, implicit-def $q20_q21 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + $q2 = ORRv16i8 $q20, killed $q20 + $q3 = ORRv16i8 $q21, killed $q21 + RET_ReallyLR +... +--- +name: impdef_both_same +body: | + bb.0: + liveins: $lr + ; CHECK-LABEL: name: impdef_both_same + ; CHECK: liveins: $lr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $q5, renamable $q20 = LDPQi renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + ; CHECK-NEXT: $q0 = ORRv16i8 $q4, killed $q4 + ; CHECK-NEXT: $q1 = ORRv16i8 $q5, killed $q5 + ; CHECK-NEXT: RET_ReallyLR + renamable $q5 = LDRQui renamable $lr, 3, implicit-def $q4_q5 :: (load (s128)) + renamable $q20 = LDRQui renamable $lr, 4, implicit-def $q4_q5 :: (load (s128)) + $q0 = ORRv16i8 $q4, killed $q4 + $q1 = ORRv16i8 $q5, killed $q5 + RET_ReallyLR +... diff --git a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll index 7f0968c8eb339..f77ada4eae022 100644 --- a/llvm/test/CodeGen/AArch64/preserve_mostcc.ll +++ b/llvm/test/CodeGen/AArch64/preserve_mostcc.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-apple-ios-8.0.0 | FileCheck -check-prefix CHECK -check-prefix CHECK-DARWIN %s +; RUN: llc < %s -mtriple=aarch64-unknown-windows-msvc | FileCheck -check-prefix CHECK -check-prefix CHECK-WIN %s declare void @standard_cc_func() declare preserve_mostcc void @preserve_mostcc_func() @@ -8,18 +9,26 @@ declare preserve_mostcc void @preserve_mostcc_func() define preserve_mostcc void @preserve_mostcc1() nounwind { entry: ;CHECK-LABEL: preserve_mostcc1 -;CHECK-NOT: stp -;CHECK-NOT: str -;CHECK: str x15 -;CHECK-NEXT: stp x14, x13, -;CHECK-NEXT: stp x12, x11, -;CHECK-NEXT: stp x10, x9, -;CHECK: bl _standard_cc_func +;CHECK-DARWIN-NOT: stp +;CHECK-DARWIN-NOT: str +;CHECK-DARWIN: str x15 +;CHECK-DARWIN-NEXT: stp x14, x13, +;CHECK-DARWIN-NEXT: stp x12, x11, +;CHECK-DARWIN-NEXT: stp x10, x9, +;CHECK-WIN: stp x15, x14 +;CHECK-WIN-NEXT: stp x13, x12, +;CHECK-WIN-NEXT: stp x11, x10, +;CHECK-WIN-NEXT: stp x9, x30 +;CHECK: bl {{_?}}standard_cc_func call void @standard_cc_func() -;CHECK: ldp x10, x9, -;CHECK-NEXT: ldp x12, x11, -;CHECK-NEXT: ldp x14, x13, -;CHECK-NEXT: ldr x15 +;CHECK-DARWIN: ldp x10, x9, +;CHECK-DARWIN-NEXT: ldp x12, x11, +;CHECK-DARWIN-NEXT: ldp x14, x13, +;CHECK-DARWIN-NEXT: ldr x15 +;CHECK-WIN: ldp x9, x30 +;CHECK-WIN-NEXT: ldp x11, x10, +;CHECK-WIN-NEXT: ldp x13, x12, +;CHECK-WIN-NEXT: ldp x15, x14, ret void } @@ -31,9 +40,10 @@ define preserve_mostcc void @preserve_mostcc2() nounwind { entry: ;CHECK-LABEL: preserve_mostcc2 ;CHECK-NOT: x14 -;CHECK: stp x29, x30, +;CHECK-DARWIN: stp x29, x30, +;CHECK-WIN: str x30 ;CHECK-NOT: x14 -;CHECK: bl _preserve_mostcc_func +;CHECK: bl {{_?}}preserve_mostcc_func call preserve_mostcc void @preserve_mostcc_func() ret void } diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll 
b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll index 71c6380177b3a..8a0ac6d4ace7a 100644 --- a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll +++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll @@ -780,6 +780,7 @@ define <vscale x 4 x float> @llvm_tanh_vscale_f32(<vscale x 4 x float> %in) #0 { attributes #0 = { "target-features"="+sve" } ;. -; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR1]] = { "target-features"="+sve" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. diff --git a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll index c13dd33865c37..f65aec6665cec 100644 --- a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll +++ b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll @@ -737,36 +737,23 @@ entry: } declare ptr @memset(ptr, i32, i32) -; FIXME: aarch64-split-sve-objects is currently not supported in this function -; as it requires stack reealignment (for the 32-byte aligned alloca). -; GPR CSRs -; <hazard padding> -; FPR CSRs -; <hazrd padding> -; <SVE locals (PPRs and ZPRs)> <--- hazard between PPRs and ZPRs here! -; <realignment padding> -; -> sp define void @zpr_and_ppr_local_realignment(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr) "aarch64_pstate_sm_compatible" { ; CHECK-LABEL: zpr_and_ppr_local_realignment: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #1040 -; CHECK-NEXT: sub x9, sp, #1040 -; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK-NEXT: add x29, sp, #1024 +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: sub x9, sp, #2064 +; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: addvl x9, x9, #-2 -; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill ; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK-NEXT: .cfi_def_cfa w29, 16 ; CHECK-NEXT: .cfi_offset w30, -8 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: sub x8, x29, #1024 -; CHECK-NEXT: str p0, [x8, #-1, mul vl] +; CHECK-NEXT: str p0, [x29, #-1, mul vl] ; CHECK-NEXT: str z0, [x8, #-2, mul vl] ; CHECK-NEXT: str x0, [sp] -; CHECK-NEXT: sub sp, x29, #1024 -; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret %ppr_local = alloca <vscale x 16 x i1> %zpr_local = alloca <vscale x 16 x i8> @@ -805,3 +792,316 @@ define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x store volatile i64 %gpr, ptr %gpr_local ret void } + +; Only PPR callee-saves + a VLA +; Expect: No hazard padding. Frame pointer (x29), p4-p6 callee saves allocated +; with `addvl #-1`, PPR saves restored using frame pointer `addvl sp, x29, #-1`. +define aarch64_sve_vector_pcs void @only_ppr_csr_vla(i64 %n) { +; CHECK-LABEL: only_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + call void (...) @llvm.fake.use(ptr %alloc) + tail call void asm sideeffect "", "~{p4},~{p5},~{p6}"() + ret void +} + +; Only ZPR callee-saves + a VLA +; Expect: Hazard padding, Frame pointer (x29), z8-z10 callee saves allocated +; with `addvl #-3`. ZPR saves restored from `FP - 1024 + addvl #-3`. +define aarch64_sve_vector_pcs void @only_zpr_csr_vla(i64 %n) { +; CHECK-LABEL: only_zpr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #1056 +; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK-NEXT: add x29, sp, #1024 +; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #1040] // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: addvl sp, x8, #-3 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: sub sp, x29, #1024 +; CHECK-NEXT: ldr x19, [sp, #1040] // 8-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1056 +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + call void (...) 
@llvm.fake.use(ptr %alloc) + tail call void asm sideeffect "", "~{z8},~{z9},~{z10}"() + ret void +} + +; PPR+ZPR callee-saves + a VLA +; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) and ZPR (z8-z10) +; callee-saves allocated separately, with hazard padding of 1024 between the +; areas. ZPR callee saves restored by `FP - 1024 + addvl #-4`, PPR callee saves +; restored by `FP + addvl #-1`. +define aarch64_sve_vector_pcs void @zpr_ppr_csr_vla(i64 %n) { +; CHECK-LABEL: zpr_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-3 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: addvl sp, x8, #-4 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + call void (...) @llvm.fake.use(ptr %alloc) + tail call void asm sideeffect "", "~{p4},~{p5},~{p6},~{z8},~{z9},~{z10}"() + ret void +} + +; Only PPR callee-saves (and ZPR/PPR locals) + a VLA +; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) callee-saves, with +; hazard padding after the PPR callee saves (1024) and after the FPR local area +; (1024) -- coeleased to 2048. Only PPRs restored by moving the SP to +; `FP + addvl #-1`. +define void @sve_locals_only_ppr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) { +; CHECK-LABEL: sve_locals_only_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #2048 +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-9, mul vl] +; CHECK-NEXT: str z0, [x8, #-3, mul vl] +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{p4},~{p5},~{p6}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} + +; Only ZPR callee-saves (and ZPR/PPR locals) + a VLA +; Expect: Hazard padding, Frame pointer (x29), ZPR (z8-z10) callee-saves, with +; hazard padding before the ZPR callee saves (1024) and after the ZPR local area +; (1024). Only ZPRs restored by moving the SP to `FP - 1024 + addvl #-4`. +define void @sve_locals_only_zpr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) { +; CHECK-LABEL: sve_locals_only_zpr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-1, mul vl] +; CHECK-NEXT: str z0, [x8, #-5, mul vl] +; CHECK-NEXT: addvl sp, x8, #-4 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{z8},~{z9},~{z10}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} + +; PPR+ZPR callee-saves (and ZPR/PPR locals) + a VLA +; Expect: Hazard padding, Frame pointer (x29), PPR (p4-p6) and ZPR (z8-z10) +; callee-saves, with hazard padding before the ZPR callee saves (1024) and after +; the ZPR local area (1024). ZPRs restored by moving the SP to +; `FP - 1024 + addvl #-5`, PPRs restored by moving SP to `FP + addvl #-1`. +define void @sve_locals_zpr_ppr_csr_vla(i64 %n, <vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) { +; CHECK-LABEL: sve_locals_zpr_ppr_csr_vla: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-4 +; CHECK-NEXT: str z10, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w19, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 24 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 40 * VG - 1056 +; CHECK-NEXT: add x9, x0, #15 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: // fake_use: $x8 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-9, mul vl] +; CHECK-NEXT: str z0, [x8, #-6, mul vl] +; CHECK-NEXT: addvl sp, x8, #-5 +; CHECK-NEXT: ldr z10, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, x29, #-1 +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %alloc = alloca i8, i64 %n, align 1 + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + tail call void asm sideeffect "", "~{p4},~{p5},~{p6},~{z8},~{z9},~{z10}"() + call void (...) @llvm.fake.use(ptr %alloc) + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll index bdee359487ce6..70874761b82ab 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -3512,14 +3512,13 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; ; CHECK64-LABEL: svecc_call_dynamic_alloca: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: .cfi_def_cfa_offset 128 +; CHECK64-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 64 ; CHECK64-NEXT: cntd x9 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: stp x20, x19, [sp, #112] // 16-byte Folded Spill -; CHECK64-NEXT: add x29, sp, #64 +; CHECK64-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: mov x29, sp ; CHECK64-NEXT: .cfi_def_cfa w29, 64 ; CHECK64-NEXT: .cfi_offset w19, -8 ; CHECK64-NEXT: .cfi_offset w20, -16 @@ -3529,7 +3528,7 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: .cfi_offset vg, -48 ; CHECK64-NEXT: .cfi_offset w30, -56 ; CHECK64-NEXT: .cfi_offset w29, -64 -; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill @@ -3542,30 +3541,32 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128 -; 
CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-16 +; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128 ; CHECK64-NEXT: sub sp, sp, #64 ; CHECK64-NEXT: mov x19, sp ; CHECK64-NEXT: mov w2, w1 @@ -3595,22 +3596,31 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: sub x8, x29, #64 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: addvl sp, x8, #-18 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: 
ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, x29, #-2 +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: .cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -3623,21 +3633,12 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: .cfi_restore z8 -; CHECK64-NEXT: .cfi_restore z9 -; CHECK64-NEXT: .cfi_restore z10 -; CHECK64-NEXT: .cfi_restore z11 -; CHECK64-NEXT: .cfi_restore z12 -; CHECK64-NEXT: .cfi_restore z13 -; CHECK64-NEXT: .cfi_restore z14 -; CHECK64-NEXT: .cfi_restore z15 -; CHECK64-NEXT: sub sp, x29, #64 -; CHECK64-NEXT: .cfi_def_cfa wsp, 128 -; CHECK64-NEXT: ldp x20, x19, [sp, #112] // 16-byte Folded Reload -; CHECK64-NEXT: ldr x28, [sp, #88] // 8-byte Folded Reload -; CHECK64-NEXT: ldp x27, x26, [sp, #96] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: .cfi_def_cfa wsp, 64 +; CHECK64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: .cfi_def_cfa_offset 0 ; CHECK64-NEXT: .cfi_restore w19 ; CHECK64-NEXT: .cfi_restore w20 
@@ -3649,305 +3650,444 @@ define i32 @svecc_call_dynamic_alloca(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x ; CHECK64-NEXT: .cfi_restore w29 ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call_dynamic_alloca: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 1088 -; CHECK1024-NEXT: cntd x9 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -8 -; CHECK1024-NEXT: .cfi_offset w20, -16 -; CHECK1024-NEXT: .cfi_offset w26, -24 -; CHECK1024-NEXT: .cfi_offset w27, -32 -; CHECK1024-NEXT: .cfi_offset w28, -40 -; CHECK1024-NEXT: .cfi_offset vg, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 -; 
CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: mov x19, sp -; CHECK1024-NEXT: mov w2, w1 -; CHECK1024-NEXT: mov w8, w0 -; CHECK1024-NEXT: bl __arm_sme_state -; CHECK1024-NEXT: mov w8, w8 -; CHECK1024-NEXT: mov x9, sp -; CHECK1024-NEXT: mov x20, x0 -; CHECK1024-NEXT: add x8, x8, #15 -; CHECK1024-NEXT: and x8, x8, #0x1fffffff0 -; CHECK1024-NEXT: sub x8, x9, x8 -; CHECK1024-NEXT: mov sp, x8 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: tbz w20, #0, .LBB35_2 -; CHECK1024-NEXT: // %bb.1: // %entry -; CHECK1024-NEXT: smstop sm -; CHECK1024-NEXT: .LBB35_2: // %entry -; CHECK1024-NEXT: mov x0, x8 -; CHECK1024-NEXT: mov w1, #45 // =0x2d -; CHECK1024-NEXT: bl memset -; CHECK1024-NEXT: tbz w20, #0, .LBB35_4 -; CHECK1024-NEXT: // %bb.3: // %entry -; CHECK1024-NEXT: smstart sm -; CHECK1024-NEXT: .LBB35_4: // %entry -; CHECK1024-NEXT: mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: sub x8, x29, #1024 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: addvl sp, x8, #-18 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; 
CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: .cfi_restore z8 -; CHECK1024-NEXT: .cfi_restore z9 -; CHECK1024-NEXT: .cfi_restore z10 -; CHECK1024-NEXT: .cfi_restore z11 -; CHECK1024-NEXT: .cfi_restore z12 -; CHECK1024-NEXT: .cfi_restore z13 -; CHECK1024-NEXT: .cfi_restore z14 -; CHECK1024-NEXT: .cfi_restore z15 -; CHECK1024-NEXT: sub sp, x29, #1024 -; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088 -; CHECK1024-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 0 -; CHECK1024-NEXT: .cfi_restore w19 -; CHECK1024-NEXT: .cfi_restore w20 -; CHECK1024-NEXT: .cfi_restore w26 -; CHECK1024-NEXT: .cfi_restore w27 -; CHECK1024-NEXT: .cfi_restore w28 -; CHECK1024-NEXT: .cfi_restore vg -; CHECK1024-NEXT: .cfi_restore w30 -; CHECK1024-NEXT: .cfi_restore w29 -; CHECK1024-NEXT: ret -entry: - %ptr = alloca i8, i32 %P1 - tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 - %call = call ptr @memset(ptr noundef nonnull %ptr, i32 noundef 45, i32 noundef %P2) - ret i32 -396142473 -} - - -define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" { -; CHECK0-LABEL: svecc_call_realign: -; CHECK0: // %bb.0: // %entry -; CHECK0-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill -; CHECK0-NEXT: .cfi_def_cfa_offset 64 -; CHECK0-NEXT: cntd x9 -; CHECK0-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill -; CHECK0-NEXT: str x9, [sp, #16] // 8-byte Folded Spill -; CHECK0-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill -; CHECK0-NEXT: mov x29, sp -; CHECK0-NEXT: .cfi_def_cfa w29, 64 -; CHECK0-NEXT: .cfi_offset w19, -8 -; CHECK0-NEXT: .cfi_offset w26, -16 -; CHECK0-NEXT: .cfi_offset w27, -24 -; CHECK0-NEXT: .cfi_offset w28, -32 -; CHECK0-NEXT: .cfi_offset vg, -48 -; CHECK0-NEXT: .cfi_offset w30, -56 -; CHECK0-NEXT: .cfi_offset w29, -64 -; CHECK0-NEXT: addvl sp, sp, #-18 -; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * IncomingVG - 64 -; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * IncomingVG - 64 -; 
CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * IncomingVG - 64 -; CHECK0-NEXT: sub x9, sp, #1024 -; CHECK0-NEXT: and sp, x9, #0xffffffffffffffe0 -; CHECK0-NEXT: mov w2, w1 -; CHECK0-NEXT: bl __arm_sme_state -; CHECK0-NEXT: mov x19, x0 -; CHECK0-NEXT: //APP -; CHECK0-NEXT: //NO_APP -; CHECK0-NEXT: tbz w19, #0, .LBB36_2 -; CHECK0-NEXT: // %bb.1: // %entry -; CHECK0-NEXT: smstop sm -; CHECK0-NEXT: .LBB36_2: // %entry -; CHECK0-NEXT: mov x0, sp -; CHECK0-NEXT: mov w1, #45 // =0x2d -; CHECK0-NEXT: bl memset -; CHECK0-NEXT: tbz w19, #0, .LBB36_4 -; CHECK0-NEXT: // %bb.3: // %entry -; CHECK0-NEXT: smstart sm -; CHECK0-NEXT: .LBB36_4: // %entry -; CHECK0-NEXT: mov w0, #22647 // =0x5877 -; CHECK0-NEXT: movk w0, #59491, lsl #16 -; CHECK0-NEXT: addvl sp, x29, #-18 -; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: .cfi_restore z8 -; CHECK0-NEXT: .cfi_restore z9 -; CHECK0-NEXT: .cfi_restore z10 -; CHECK0-NEXT: .cfi_restore z11 -; CHECK0-NEXT: .cfi_restore z12 -; CHECK0-NEXT: .cfi_restore z13 -; CHECK0-NEXT: .cfi_restore z14 -; CHECK0-NEXT: .cfi_restore z15 -; CHECK0-NEXT: mov sp, x29 -; CHECK0-NEXT: .cfi_def_cfa wsp, 64 -; CHECK0-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload -; CHECK0-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload -; CHECK0-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload -; CHECK0-NEXT: .cfi_def_cfa_offset 0 -; CHECK0-NEXT: .cfi_restore w19 -; CHECK0-NEXT: .cfi_restore w26 -; CHECK0-NEXT: .cfi_restore w27 -; CHECK0-NEXT: .cfi_restore w28 -; CHECK0-NEXT: .cfi_restore vg -; CHECK0-NEXT: .cfi_restore w30 -; CHECK0-NEXT: .cfi_restore w29 -; CHECK0-NEXT: ret -; -; CHECK64-LABEL: 
svecc_call_realign: -; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: .cfi_def_cfa_offset 128 -; CHECK64-NEXT: cntd x9 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: stp x9, x28, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x27, x26, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill -; CHECK64-NEXT: add x29, sp, #64 -; CHECK64-NEXT: .cfi_def_cfa w29, 64 -; CHECK64-NEXT: .cfi_offset w19, -16 -; CHECK64-NEXT: .cfi_offset w26, -24 -; CHECK64-NEXT: .cfi_offset w27, -32 -; CHECK64-NEXT: .cfi_offset w28, -40 +; CHECK1024-NOSPLITSVE-LABEL: svecc_call_dynamic_alloca: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088 +; CHECK1024-NOSPLITSVE-NEXT: cntd x9 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x20, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1080] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; 
CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: mov x19, sp +; CHECK1024-NOSPLITSVE-NEXT: mov w2, w1 +; CHECK1024-NOSPLITSVE-NEXT: mov w8, w0 +; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-NOSPLITSVE-NEXT: mov w8, w8 +; CHECK1024-NOSPLITSVE-NEXT: mov x9, sp +; CHECK1024-NOSPLITSVE-NEXT: mov x20, x0 +; CHECK1024-NOSPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-NOSPLITSVE-NEXT: and x8, x8, #0x1fffffff0 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x9, x8 +; CHECK1024-NOSPLITSVE-NEXT: mov sp, x8 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: tbz w20, #0, .LBB35_2 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstop sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB35_2: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov x0, x8 +; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NOSPLITSVE-NEXT: bl memset +; CHECK1024-NOSPLITSVE-NEXT: tbz w20, #0, .LBB35_4 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstart sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB35_4: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; 
CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1080] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x20, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w20 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-NOSPLITSVE-NEXT: 
.cfi_restore vg +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_call_dynamic_alloca: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64 +; CHECK1024-SPLITSVE-NEXT: cntd x9 +; CHECK1024-SPLITSVE-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x9, x28, [sp, #16] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -40 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 
// $d8 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: mov x19, sp +; CHECK1024-SPLITSVE-NEXT: mov w2, w1 +; CHECK1024-SPLITSVE-NEXT: mov w8, w0 +; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-SPLITSVE-NEXT: mov w8, w8 +; CHECK1024-SPLITSVE-NEXT: mov x9, sp +; CHECK1024-SPLITSVE-NEXT: mov x20, x0 +; CHECK1024-SPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-SPLITSVE-NEXT: and x8, x8, #0x1fffffff0 +; CHECK1024-SPLITSVE-NEXT: sub x8, x9, x8 +; CHECK1024-SPLITSVE-NEXT: mov sp, x8 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: tbz w20, #0, .LBB35_2 +; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-SPLITSVE-NEXT: smstop sm +; CHECK1024-SPLITSVE-NEXT: .LBB35_2: // %entry +; CHECK1024-SPLITSVE-NEXT: mov x0, x8 +; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-SPLITSVE-NEXT: bl memset +; CHECK1024-SPLITSVE-NEXT: tbz w20, #0, .LBB35_4 +; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-SPLITSVE-NEXT: smstart sm +; CHECK1024-SPLITSVE-NEXT: .LBB35_4: // %entry +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded 
Reload +; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: mov sp, x29 +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64 +; CHECK1024-SPLITSVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr x28, [sp, #24] // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w20 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-SPLITSVE-NEXT: ret +entry: + %ptr = alloca i8, i32 %P1 + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %call = call ptr @memset(ptr noundef nonnull %ptr, i32 noundef 45, i32 noundef %P2) + ret i32 -396142473 +} + + +define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" { +; CHECK0-LABEL: svecc_call_realign: +; CHECK0: // %bb.0: // %entry +; CHECK0-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK0-NEXT: .cfi_def_cfa_offset 64 +; CHECK0-NEXT: cntd x9 +; CHECK0-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK0-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK0-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK0-NEXT: mov x29, sp +; CHECK0-NEXT: .cfi_def_cfa w29, 64 +; CHECK0-NEXT: .cfi_offset w19, -8 +; CHECK0-NEXT: .cfi_offset w26, -16 +; CHECK0-NEXT: .cfi_offset w27, -24 +; CHECK0-NEXT: .cfi_offset w28, -32 +; CHECK0-NEXT: .cfi_offset vg, -48 +; CHECK0-NEXT: .cfi_offset w30, -56 +; CHECK0-NEXT: .cfi_offset w29, -64 +; CHECK0-NEXT: addvl sp, sp, #-18 +; CHECK0-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK0-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK0-NEXT: .cfi_escape 0x10, 0x48, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d8 @ cfa - 8 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x49, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d9 @ cfa - 16 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4a, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d10 @ cfa - 24 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4b, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d11 @ cfa - 32 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4c, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d12 @ cfa - 40 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d13 @ cfa - 48 * IncomingVG - 64 +; CHECK0-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d14 @ cfa - 56 * IncomingVG - 64 +; 
CHECK0-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x40, 0x22 // $d15 @ cfa - 64 * IncomingVG - 64 +; CHECK0-NEXT: sub x9, sp, #1024 +; CHECK0-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK0-NEXT: mov w2, w1 +; CHECK0-NEXT: bl __arm_sme_state +; CHECK0-NEXT: mov x19, x0 +; CHECK0-NEXT: //APP +; CHECK0-NEXT: //NO_APP +; CHECK0-NEXT: tbz w19, #0, .LBB36_2 +; CHECK0-NEXT: // %bb.1: // %entry +; CHECK0-NEXT: smstop sm +; CHECK0-NEXT: .LBB36_2: // %entry +; CHECK0-NEXT: mov x0, sp +; CHECK0-NEXT: mov w1, #45 // =0x2d +; CHECK0-NEXT: bl memset +; CHECK0-NEXT: tbz w19, #0, .LBB36_4 +; CHECK0-NEXT: // %bb.3: // %entry +; CHECK0-NEXT: smstart sm +; CHECK0-NEXT: .LBB36_4: // %entry +; CHECK0-NEXT: mov w0, #22647 // =0x5877 +; CHECK0-NEXT: movk w0, #59491, lsl #16 +; CHECK0-NEXT: addvl sp, x29, #-18 +; CHECK0-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK0-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK0-NEXT: .cfi_restore z8 +; CHECK0-NEXT: .cfi_restore z9 +; CHECK0-NEXT: .cfi_restore z10 +; CHECK0-NEXT: .cfi_restore z11 +; CHECK0-NEXT: .cfi_restore z12 +; CHECK0-NEXT: .cfi_restore z13 +; CHECK0-NEXT: .cfi_restore z14 +; CHECK0-NEXT: .cfi_restore z15 +; CHECK0-NEXT: mov sp, x29 +; CHECK0-NEXT: .cfi_def_cfa wsp, 64 +; CHECK0-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK0-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK0-NEXT: .cfi_def_cfa_offset 0 +; CHECK0-NEXT: .cfi_restore w19 +; CHECK0-NEXT: .cfi_restore w26 +; CHECK0-NEXT: .cfi_restore w27 +; CHECK0-NEXT: .cfi_restore w28 +; CHECK0-NEXT: .cfi_restore vg +; CHECK0-NEXT: .cfi_restore w30 +; CHECK0-NEXT: .cfi_restore w29 +; CHECK0-NEXT: ret +; +; CHECK64-LABEL: 
svecc_call_realign: +; CHECK64: // %bb.0: // %entry +; CHECK64-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_def_cfa_offset 64 +; CHECK64-NEXT: cntd x9 +; CHECK64-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK64-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: mov x29, sp +; CHECK64-NEXT: .cfi_def_cfa w29, 64 +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w26, -16 +; CHECK64-NEXT: .cfi_offset w27, -24 +; CHECK64-NEXT: .cfi_offset w28, -32 ; CHECK64-NEXT: .cfi_offset vg, -48 ; CHECK64-NEXT: .cfi_offset w30, -56 ; CHECK64-NEXT: .cfi_offset w29, -64 -; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill @@ -3960,30 +4100,32 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * IncomingVG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 
0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-16 +; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * IncomingVG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * IncomingVG - 128 ; CHECK64-NEXT: sub x9, sp, #1088 ; CHECK64-NEXT: and sp, x9, #0xffffffffffffffe0 ; CHECK64-NEXT: mov w2, w1 @@ -4006,22 +4148,31 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i ; CHECK64-NEXT: sub x8, x29, #64 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: addvl sp, x8, #-18 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, 
[sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, x29, #-2 +; CHECK64-NEXT: .cfi_restore z8 +; CHECK64-NEXT: .cfi_restore z9 +; CHECK64-NEXT: .cfi_restore z10 +; CHECK64-NEXT: .cfi_restore z11 +; CHECK64-NEXT: .cfi_restore z12 +; CHECK64-NEXT: .cfi_restore z13 +; CHECK64-NEXT: .cfi_restore z14 +; CHECK64-NEXT: .cfi_restore z15 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -4034,20 +4185,11 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: .cfi_restore z8 -; CHECK64-NEXT: .cfi_restore z9 -; CHECK64-NEXT: .cfi_restore z10 -; CHECK64-NEXT: .cfi_restore z11 -; CHECK64-NEXT: .cfi_restore z12 -; CHECK64-NEXT: .cfi_restore z13 -; CHECK64-NEXT: .cfi_restore z14 -; CHECK64-NEXT: .cfi_restore z15 -; CHECK64-NEXT: sub sp, x29, #64 -; CHECK64-NEXT: .cfi_def_cfa wsp, 128 -; CHECK64-NEXT: ldp x26, x19, [sp, #104] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x28, x27, [sp, #88] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: .cfi_def_cfa wsp, 64 +; CHECK64-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: .cfi_def_cfa_offset 0 ; CHECK64-NEXT: .cfi_restore w19 ; CHECK64-NEXT: .cfi_restore w26 @@ -4058,140 +4200,270 @@ define i32 @svecc_call_realign(<4 x i16> %P0, i32 %P1, i32 %P2, <vscale x 16 x i ; CHECK64-NEXT: .cfi_restore w29 ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call_realign: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 1088 -; CHECK1024-NEXT: 
cntd x9 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -16 -; CHECK1024-NEXT: .cfi_offset w26, -24 -; CHECK1024-NEXT: .cfi_offset w27, -32 -; CHECK1024-NEXT: .cfi_offset w28, -40 -; CHECK1024-NEXT: .cfi_offset vg, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // 
$d12 @ cfa - 40 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 -; CHECK1024-NEXT: sub x9, sp, #2048 -; CHECK1024-NEXT: and sp, x9, #0xffffffffffffffe0 -; CHECK1024-NEXT: mov w2, w1 -; CHECK1024-NEXT: bl __arm_sme_state -; CHECK1024-NEXT: mov x19, x0 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: tbz w19, #0, .LBB36_2 -; CHECK1024-NEXT: // %bb.1: // %entry -; CHECK1024-NEXT: smstop sm -; CHECK1024-NEXT: .LBB36_2: // %entry -; CHECK1024-NEXT: mov x0, sp -; CHECK1024-NEXT: mov w1, #45 // =0x2d -; CHECK1024-NEXT: bl memset -; CHECK1024-NEXT: tbz w19, #0, .LBB36_4 -; CHECK1024-NEXT: // %bb.3: // %entry -; CHECK1024-NEXT: smstart sm -; CHECK1024-NEXT: .LBB36_4: // %entry -; CHECK1024-NEXT: mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: sub x8, x29, #1024 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: addvl sp, x8, #-18 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: .cfi_restore z8 -; CHECK1024-NEXT: .cfi_restore z9 -; CHECK1024-NEXT: .cfi_restore z10 -; CHECK1024-NEXT: .cfi_restore z11 -; CHECK1024-NEXT: .cfi_restore z12 -; CHECK1024-NEXT: .cfi_restore z13 -; CHECK1024-NEXT: .cfi_restore z14 -; CHECK1024-NEXT: 
.cfi_restore z15 -; CHECK1024-NEXT: sub sp, x29, #1024 -; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088 -; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 0 -; CHECK1024-NEXT: .cfi_restore w19 -; CHECK1024-NEXT: .cfi_restore w26 -; CHECK1024-NEXT: .cfi_restore w27 -; CHECK1024-NEXT: .cfi_restore w28 -; CHECK1024-NEXT: .cfi_restore vg -; CHECK1024-NEXT: .cfi_restore w30 -; CHECK1024-NEXT: .cfi_restore w29 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_call_realign: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088 +; CHECK1024-NOSPLITSVE-NEXT: cntd x9 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str 
z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: sub x9, sp, #2048 +; CHECK1024-NOSPLITSVE-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK1024-NOSPLITSVE-NEXT: mov w2, w1 +; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-NOSPLITSVE-NEXT: mov x19, x0 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB36_2 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstop sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB36_2: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov x0, sp +; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NOSPLITSVE-NEXT: bl memset +; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB36_4 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstart sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB36_4: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; 
CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_call_realign: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64 +; CHECK1024-SPLITSVE-NEXT: cntd x9 +; CHECK1024-SPLITSVE-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 
@ cfa - 40 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: sub x9, sp, #2048 +; CHECK1024-SPLITSVE-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK1024-SPLITSVE-NEXT: mov w2, w1 +; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-SPLITSVE-NEXT: mov x19, x0 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB36_2 +; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-SPLITSVE-NEXT: smstop sm +; CHECK1024-SPLITSVE-NEXT: .LBB36_2: // %entry +; CHECK1024-SPLITSVE-NEXT: mov x0, sp +; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-SPLITSVE-NEXT: bl memset +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB36_4 +; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-SPLITSVE-NEXT: smstart sm +; CHECK1024-SPLITSVE-NEXT: .LBB36_4: // %entry +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14 +; 
CHECK1024-SPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: mov sp, x29 +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64 +; CHECK1024-SPLITSVE-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-SPLITSVE-NEXT: ret entry: %ptr = alloca i8, i32 1000, align 32 tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 @@ -4311,13 +4583,12 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; ; CHECK64-LABEL: svecc_call_dynamic_and_scalable_alloca: ; CHECK64: // %bb.0: // %entry -; CHECK64-NEXT: sub sp, sp, #128 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: add x29, sp, #64 -; CHECK64-NEXT: stp x28, x27, [sp, #80] // 16-byte Folded Spill -; CHECK64-NEXT: stp x26, x20, [sp, #96] // 16-byte Folded Spill -; CHECK64-NEXT: str x19, [sp, #112] // 8-byte Folded Spill -; CHECK64-NEXT: addvl sp, sp, #-18 +; CHECK64-NEXT: stp x29, x30, [sp, #-64]! 
// 16-byte Folded Spill +; CHECK64-NEXT: str x28, [sp, #16] // 8-byte Folded Spill +; CHECK64-NEXT: mov x29, sp +; CHECK64-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK64-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK64-NEXT: addvl sp, sp, #-2 ; CHECK64-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill @@ -4330,41 +4601,43 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; CHECK64-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill ; CHECK64-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK64-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: sub sp, sp, #64 +; CHECK64-NEXT: addvl sp, sp, #-16 +; CHECK64-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK64-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK64-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill ; CHECK64-NEXT: sub sp, sp, #112 ; CHECK64-NEXT: addvl sp, sp, #-1 ; CHECK64-NEXT: mov x19, sp ; CHECK64-NEXT: .cfi_def_cfa w29, 64 -; CHECK64-NEXT: .cfi_offset w19, -16 -; CHECK64-NEXT: .cfi_offset w20, -24 -; CHECK64-NEXT: .cfi_offset w26, -32 -; CHECK64-NEXT: .cfi_offset w27, -40 +; CHECK64-NEXT: .cfi_offset w19, -8 +; CHECK64-NEXT: .cfi_offset w20, -16 +; CHECK64-NEXT: .cfi_offset w26, -24 +; CHECK64-NEXT: .cfi_offset w27, -32 ; CHECK64-NEXT: .cfi_offset w28, -48 ; CHECK64-NEXT: .cfi_offset w30, -56 ; CHECK64-NEXT: .cfi_offset w29, -64 -; CHECK64-NEXT: 
.cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 8 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 16 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 24 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 32 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 40 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 48 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 56 * VG - 128 -; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 64 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d8 @ cfa - 24 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d9 @ cfa - 32 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d10 @ cfa - 40 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d11 @ cfa - 48 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d12 @ cfa - 56 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d13 @ cfa - 64 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d14 @ cfa - 72 * VG - 128 +; CHECK64-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0x80, 0x7f, 0x22 // $d15 @ cfa - 80 * VG - 128 ; CHECK64-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK64-NEXT: ubfiz x8, x0, #2, #32 ; CHECK64-NEXT: mov x9, sp @@ -4385,22 +4658,23 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; CHECK64-NEXT: sub x8, x29, #64 ; CHECK64-NEXT: movk w0, #59491, lsl #16 ; CHECK64-NEXT: addvl sp, x8, #-18 -; CHECK64-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; 
CHECK64-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK64-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK64-NEXT: addvl sp, x29, #-2 ; CHECK64-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload @@ -4413,131 +4687,243 @@ define i32 @svecc_call_dynamic_and_scalable_alloca(<4 x i16> %P0, i32 %P1, i32 % ; CHECK64-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload ; CHECK64-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: sub sp, x29, #64 -; CHECK64-NEXT: ldp x20, x19, [sp, #104] // 16-byte Folded Reload -; CHECK64-NEXT: ldr x29, [sp, #64] // 8-byte Folded Reload -; CHECK64-NEXT: ldp x27, x26, [sp, #88] // 16-byte Folded Reload -; CHECK64-NEXT: ldp x30, x28, [sp, #72] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #128 +; CHECK64-NEXT: mov sp, x29 +; CHECK64-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK64-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK64-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK64-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call_dynamic_and_scalable_alloca: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x20, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul 
vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1072 -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: mov x19, sp -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -16 -; CHECK1024-NEXT: .cfi_offset w20, -24 -; CHECK1024-NEXT: .cfi_offset w26, -32 -; CHECK1024-NEXT: .cfi_offset w27, -40 -; CHECK1024-NEXT: .cfi_offset w28, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 -; CHECK1024-NEXT: // kill: def $w0 killed $w0 def $x0 -; CHECK1024-NEXT: ubfiz x8, x0, #2, #32 -; CHECK1024-NEXT: mov x9, sp -; CHECK1024-NEXT: add x8, x8, #15 -; CHECK1024-NEXT: and x8, x8, #0x7fffffff0 -; CHECK1024-NEXT: sub x20, x9, x8 -; CHECK1024-NEXT: mov sp, x20 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: add x0, x19, #8 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: sub x0, x29, #1024 -; CHECK1024-NEXT: addvl x0, x0, #-19 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: mov x0, x20 -; CHECK1024-NEXT: bl bar -; CHECK1024-NEXT: 
mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: sub x8, x29, #1024 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: addvl sp, x8, #-18 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: sub sp, x29, #1024 -; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x20, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_call_dynamic_and_scalable_alloca: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x20, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, 
mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1072 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-NOSPLITSVE-NEXT: mov x19, sp +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w20, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 
// $d13 @ cfa - 48 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * VG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK1024-NOSPLITSVE-NEXT: ubfiz x8, x0, #2, #32 +; CHECK1024-NOSPLITSVE-NEXT: mov x9, sp +; CHECK1024-NOSPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-NOSPLITSVE-NEXT: and x8, x8, #0x7fffffff0 +; CHECK1024-NOSPLITSVE-NEXT: sub x20, x9, x8 +; CHECK1024-NOSPLITSVE-NEXT: mov sp, x20 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: add x0, x19, #8 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: sub x0, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: addvl x0, x0, #-19 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: mov x0, x20 +; CHECK1024-NOSPLITSVE-NEXT: bl bar +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; 
CHECK1024-NOSPLITSVE-NEXT: sub sp, x29, #1024 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x20, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_call_dynamic_and_scalable_alloca: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str x28, [sp, #16] // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: stp x27, x26, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1072 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-1 +; CHECK1024-SPLITSVE-NEXT: mov x19, sp +; 
CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w20, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * VG - 1088 +; CHECK1024-SPLITSVE-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK1024-SPLITSVE-NEXT: ubfiz x8, x0, #2, #32 +; CHECK1024-SPLITSVE-NEXT: mov x9, sp +; CHECK1024-SPLITSVE-NEXT: add x8, x8, #15 +; CHECK1024-SPLITSVE-NEXT: and x8, x8, #0x7fffffff0 +; CHECK1024-SPLITSVE-NEXT: sub x20, x9, x8 +; CHECK1024-SPLITSVE-NEXT: mov sp, x20 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: add x0, x19, #8 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: sub x0, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl x0, x0, #-19 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: mov x0, x20 +; CHECK1024-SPLITSVE-NEXT: bl bar +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: sub x8, x29, #1024 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: addvl sp, x8, #-18 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; 
CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, x29, #-2 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: mov sp, x29 +; CHECK1024-SPLITSVE-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x27, x26, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ret entry: %a = alloca i32, i32 10 %b = alloca <vscale x 4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index e411c23c77bbe..7b5621ff3b5a9 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -27,11 +27,11 @@ define amdgpu_kernel void @set_inactive_imm_poison(ptr addrspace(1) %out) { ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 -; GCN-NEXT: v_mov_b32_e32 v0, v0 +; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call i32 @llvm.amdgcn.set.inactive.i32(i32 1, i32 poison) #0 %tmp = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp.0) @@ -68,12 +68,12 @@ define amdgpu_kernel void @set_inactive_imm_poison_64(ptr addrspace(1) %out) { ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: v_mov_b32_e32 v0, 1 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, v0 -; GCN-NEXT: v_mov_b32_e32 v1, v1 +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call i64 @llvm.amdgcn.set.inactive.i64(i64 1, i64 poison) #0 %tmp = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp.0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index e86f7473363f7..37b5422be7e2f 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s ; Note: we use MIR test checks + stop after legalizer to prevent ; tests from being optimized out. 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll index 44b12a9f6fe81..61a61376d7ddd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s declare void @readsMem(ptr) #0 declare void @writesMem(ptr) #1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll index 1cd9c0bfeb7e6..2351c969d5e49 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll @@ -165,10 +165,10 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v1, v0, s[2:3] -; GFX10-NEXT: global_load_dword v2, v0, s[6:7] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-NEXT: global_load_dword v3, v0, s[6:7] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v1, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s2, v2, v3, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX10-NEXT: s_endpgm @@ -179,15 +179,15 @@ define amdgpu_kernel void @v_mul_i64_zext_src0_src1(ptr addrspace(1) %out, ptr a ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] +; GFX11-NEXT: global_load_b32 v2, v0, s[2:3] +; GFX11-NEXT: global_load_b32 v3, v0, s[4:5] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v1, v0, 0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 637aaf7529364..7f10ee4c17450 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -546,10 +546,11 @@ define i64 @v_mul_i64(i64 %num, i64 %den) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v4, v0 -; GCN-NEXT: v_mov_b32_e32 v5, v1 -; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GCN-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GCN-NEXT: v_mov_b32_e32 v5, v2 +; GCN-NEXT: v_mov_b32_e32 v6, v1 +; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GCN-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i64: @@ -742,10 +743,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], 
v6, v5, 0 ; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v4, v[0:1] ; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v3, 0 -; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v2, v3, v[8:9] -; GCN-NEXT: v_mov_b32_e32 v2, v8 -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v4, v[1:2] -; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v3, v[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, v10 +; GCN-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v4, v[1:2] +; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v3, v[8:9] ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i96: @@ -758,8 +759,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0 ; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9] ; GFX10-NEXT: v_mov_b32_e32 v2, v8 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v6, v4, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[8:9] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_mul_i96: @@ -771,8 +772,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0 ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9] ; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v6, v4, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[8:9] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_mul_i96: @@ -791,8 +792,8 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v2, v3, v[8:9] ; GFX12-NEXT: v_mov_b32_e32 v2, v8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v4, v[1:2] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v6, v4, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v3, v[8:9] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; ; GFX1250-LABEL: v_mul_i96: @@ -808,10 +809,10 @@ define i96 @v_mul_i96(i96 %num, i96 %den) { ; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5 ; GFX1250-NEXT: v_mov_b32_e32 v8, v1 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v6, v4, v[8:9] -; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v7, v3, v[4:5] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v6, v4, v[8:9] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v7, v3, v[10:11] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3 +; GFX1250-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] %result = mul i96 %num, %den ret i96 %result @@ -1071,18 +1072,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX7-NEXT: v_mov_b32_e32 v9, v1 ; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX7-NEXT: v_mov_b32_e32 v10, v2 +; GFX7-NEXT: v_mov_b32_e32 v11, v3 +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v12, v4 +; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] ; GFX7-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX7-NEXT: v_mul_lo_u32 
v6, v9, v6 -; GFX7-NEXT: v_mov_b32_e32 v2, v11 -; GFX7-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX7-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX7-NEXT: v_mov_b32_e32 v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] +; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] +; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GFX7-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i128: @@ -1092,18 +1095,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX8-NEXT: v_mov_b32_e32 v9, v1 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, v2 +; GFX8-NEXT: v_mov_b32_e32 v11, v3 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v12, v4 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] ; GFX8-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX8-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX8-NEXT: v_mov_b32_e32 v2, v11 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX8-NEXT: v_addc_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v7, v6, vcc -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX8-NEXT: v_mov_b32_e32 v2, v13 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] +; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[3:4] +; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v14, v7, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i128: @@ -1113,18 +1118,20 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX9-NEXT: v_mov_b32_e32 v9, v1 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 ; GFX9-NEXT: v_mov_b32_e32 v10, v2 +; GFX9-NEXT: v_mov_b32_e32 v11, v3 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v5, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v12, v4 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v10, v12, v[2:3] ; GFX9-NEXT: v_mul_lo_u32 v7, v8, v7 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v5, v[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v4, 0 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v4, v[11:12] ; GFX9-NEXT: v_mul_lo_u32 v6, v9, v6 -; GFX9-NEXT: v_mov_b32_e32 v2, v11 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], vcc, v8, v5, v[1:2] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v4, v[1:2] -; GFX9-NEXT: v_addc_co_u32_e64 v7, s[4:5], v12, v7, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v7, v6, vcc -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v5, v[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v3, v4, v[5:6] +; GFX9-NEXT: v_mov_b32_e32 v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[3:4], vcc, v8, v5, v[1:2] +; GFX9-NEXT: v_mad_u64_u32 v[1:2], 
s[4:5], v9, v12, v[3:4] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v14, v7, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i128: @@ -1138,11 +1145,11 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v9, v5, v[0:1] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0 -; GFX10-NEXT: v_mad_u64_u32 v[11:12], s4, v10, v4, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v2, v11 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v12, v7, s4 +; GFX10-NEXT: v_mad_u64_u32 v[13:14], s4, v10, v4, v[11:12] +; GFX10-NEXT: v_mov_b32_e32 v2, v13 +; GFX10-NEXT: v_mad_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[11:12] +; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v14, v7, s4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v7, v6, vcc_lo ; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[6:7] ; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v4, v[5:6] @@ -1155,15 +1162,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX11-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v4 ; GFX11-NEXT: v_mov_b32_e32 v12, v3 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0 -; GFX11-NEXT: v_mul_lo_u32 v4, v9, v6 -; GFX11-NEXT: v_mul_lo_u32 v6, v8, v7 +; GFX11-NEXT: v_mul_lo_u32 v7, v8, v7 +; GFX11-NEXT: v_mul_lo_u32 v6, v9, v6 ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v9, v5, v[0:1] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v11, 0 -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[2:3] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[1:2] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, s0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v10, v11, v[2:3] +; GFX11-NEXT: v_mov_b32_e32 v2, v13 +; GFX11-NEXT: v_mad_u64_u32 v[3:4], vcc_lo, v8, v5, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[3:4] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v14, v7, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4] ; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v12, v11, v[6:7] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -1184,14 +1192,14 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v9, v5, v[0:1] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v10, v4, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], null, v10, v4, v[11:12] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mov_b32_e32 v2, v11 -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v8, v5, v[1:2] +; GFX12-NEXT: v_mov_b32_e32 v2, v13 +; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], vcc_lo, v8, v5, v[1:2] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[11:12] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v12, v7, s0 +; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, 
v14, v7, s0 ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, v7, v6, vcc_lo @@ -1210,16 +1218,16 @@ define i128 @v_mul_i128(i128 %num, i128 %den) { ; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1] ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v4, 0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v2, v4, v[10:11] -; GFX1250-NEXT: v_mov_b32_e32 v12, v1 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13], v2, v4, v[10:11] +; GFX1250-NEXT: v_mov_b32_e32 v10, v1 ; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mov_b32_e32 v13, v10 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[12:13] +; GFX1250-NEXT: v_mov_b32_e32 v11, v12 +; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], vcc_lo, v8, v5, v[10:11] ; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[12:13] -; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v11, v8, s0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[14:15] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v13, v8, s0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo ; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1 @@ -2401,207 +2409,216 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX7-NEXT: v_mul_lo_u32 v28, v3, v12 +; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] ; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX7-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX7-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc -; GFX7-NEXT: v_mov_b32_e32 v22, v18 -; GFX7-NEXT: v_mov_b32_e32 v18, v19 -; GFX7-NEXT: v_mov_b32_e32 v19, v16 -; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, 
v[22:23] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] +; GFX7-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX7-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] +; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] +; GFX7-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc +; GFX7-NEXT: v_mov_b32_e32 v21, v22 +; GFX7-NEXT: v_mov_b32_e32 v22, v23 +; GFX7-NEXT: v_mov_b32_e32 v23, v18 +; GFX7-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] +; GFX7-NEXT: v_mul_lo_u32 v18, v6, v9 ; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX7-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX7-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX7-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX7-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] +; GFX7-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] +; GFX7-NEXT: v_mov_b32_e32 v20, v23 +; GFX7-NEXT: v_mul_lo_u32 v25, v4, v11 +; GFX7-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] +; GFX7-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] +; GFX7-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] ; GFX7-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX7-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX7-NEXT: v_mov_b32_e32 v21, v20 -; GFX7-NEXT: v_mov_b32_e32 v20, v11 -; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX7-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX7-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11] +; GFX7-NEXT: v_mov_b32_e32 v12, v22 +; GFX7-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] +; GFX7-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11] +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] +; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] +; GFX7-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11] ; GFX7-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] +; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] +; GFX7-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11] ; 
GFX7-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11] ; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX7-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX7-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13] +; GFX7-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9] +; GFX7-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7] +; GFX7-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc +; GFX7-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] ; GFX7-NEXT: v_mov_b32_e32 v0, v10 +; GFX7-NEXT: v_mov_b32_e32 v1, v13 +; GFX7-NEXT: v_mov_b32_e32 v2, v14 +; GFX7-NEXT: v_mov_b32_e32 v7, v11 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_mul_i256: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX8-NEXT: v_mul_lo_u32 v28, v3, v12 +; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] ; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX8-NEXT: v_addc_u32_e32 v22, vcc, 0, v22, vcc -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, 0, v22, vcc -; GFX8-NEXT: v_mov_b32_e32 v22, v18 -; GFX8-NEXT: v_mov_b32_e32 v18, v19 -; GFX8-NEXT: v_mov_b32_e32 v19, v16 -; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, 
v24, vcc +; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] +; GFX8-NEXT: v_addc_u32_e32 v24, vcc, 0, v24, vcc +; GFX8-NEXT: v_mad_u64_u32 v[22:23], vcc, v4, v8, v[18:19] +; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] +; GFX8-NEXT: v_addc_u32_e32 v26, vcc, 0, v24, vcc +; GFX8-NEXT: v_mov_b32_e32 v21, v22 +; GFX8-NEXT: v_mov_b32_e32 v22, v23 +; GFX8-NEXT: v_mov_b32_e32 v23, v18 +; GFX8-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] +; GFX8-NEXT: v_mul_lo_u32 v18, v6, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX8-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX8-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX8-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX8-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] +; GFX8-NEXT: v_addc_u32_e64 v6, s[4:5], 0, v6, s[4:5] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] +; GFX8-NEXT: v_mov_b32_e32 v20, v23 +; GFX8-NEXT: v_mul_lo_u32 v25, v4, v11 +; GFX8-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] +; GFX8-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] +; GFX8-NEXT: v_addc_u32_e64 v24, s[10:11], 0, v23, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] ; GFX8-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX8-NEXT: v_addc_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX8-NEXT: v_mov_b32_e32 v21, v20 -; GFX8-NEXT: v_mov_b32_e32 v20, v11 -; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX8-NEXT: v_addc_u32_e64 v11, s[10:11], 0, v2, s[10:11] +; GFX8-NEXT: v_mov_b32_e32 v12, v22 +; GFX8-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] +; GFX8-NEXT: v_addc_u32_e64 v13, s[10:11], 0, v24, s[10:11] +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] +; GFX8-NEXT: v_addc_u32_e64 v16, s[10:11], 0, v13, s[10:11] ; GFX8-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] +; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] +; GFX8-NEXT: v_addc_u32_e64 v3, s[10:11], v3, v4, s[10:11] ; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; 
GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v17, v0, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v4, s[10:11], v6, v5, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v5, s[10:11], v16, v11, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v6, s[10:11], v26, v12, s[10:11] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v19, v0, s[10:11] ; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v16, vcc -; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX8-NEXT: v_addc_u32_e64 v0, s[10:11], v0, v2, s[12:13] +; GFX8-NEXT: v_addc_u32_e64 v0, s[8:9], v0, v28, s[8:9] +; GFX8-NEXT: v_addc_u32_e64 v0, s[6:7], v0, v25, s[6:7] +; GFX8-NEXT: v_addc_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v0, v18, vcc +; GFX8-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, v10 +; GFX8-NEXT: v_mov_b32_e32 v1, v13 +; GFX8-NEXT: v_mov_b32_e32 v2, v14 +; GFX8-NEXT: v_mov_b32_e32 v7, v11 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_mul_i256: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v0, v12, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v13, v[16:17] +; GFX9-NEXT: v_mul_lo_u32 v28, v3, v12 +; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0 +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v12, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[20:21] ; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v0, v10, 0 -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v11, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v3, v11, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v10, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v4, v10, v[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v5, v9, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[18:19] -; GFX9-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v22, vcc -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v4, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v6, v8, v[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[6:7], v1, v9, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v22, vcc -; GFX9-NEXT: v_mov_b32_e32 v22, v18 -; GFX9-NEXT: v_mov_b32_e32 v18, v19 -; GFX9-NEXT: v_mov_b32_e32 v19, v16 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v0, v13, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v16, v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v4, v10, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v1, v9, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[8:9], v5, v9, v[22:23] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], vcc, v2, v10, v[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc +; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v3, v9, v[22:23] +; GFX9-NEXT: v_addc_co_u32_e32 v24, vcc, 0, v24, vcc +; GFX9-NEXT: v_mad_u64_u32 
v[22:23], vcc, v4, v8, v[18:19] +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v6, v8, v[20:21] +; GFX9-NEXT: v_addc_co_u32_e32 v26, vcc, 0, v24, vcc +; GFX9-NEXT: v_mov_b32_e32 v21, v22 +; GFX9-NEXT: v_mov_b32_e32 v22, v23 +; GFX9-NEXT: v_mov_b32_e32 v23, v18 +; GFX9-NEXT: v_mad_u64_u32 v[24:25], vcc, v0, v13, v[22:23] +; GFX9-NEXT: v_mul_lo_u32 v18, v6, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7] -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v24, s[4:5], 0, v6, s[4:5] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v12, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v0, v11, v[21:22] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v2, v11, v[18:19] -; GFX9-NEXT: v_mul_lo_u32 v26, v4, v11 -; GFX9-NEXT: v_mul_lo_u32 v27, v3, v12 -; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v1, v10, v[21:22] -; GFX9-NEXT: v_mul_lo_u32 v25, v5, v10 -; GFX9-NEXT: v_mul_lo_u32 v28, v2, v13 -; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[11:12] +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v2, v8, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, v6, s[4:5] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v12, v[24:25] +; GFX9-NEXT: v_mov_b32_e32 v20, v23 +; GFX9-NEXT: v_mul_lo_u32 v25, v4, v11 +; GFX9-NEXT: v_mad_u64_u32 v[23:24], s[6:7], v2, v11, v[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v0, v11, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[8:9], v3, v10, v[23:24] +; GFX9-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[10:11], v1, v10, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e64 v24, s[10:11], 0, v23, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[10:11], v2, v9, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[12:13], v4, v9, v[11:12] ; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[14:15], v0, v8, 0 -; GFX9-NEXT: v_addc_co_u32_e64 v22, s[10:11], 0, v6, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[10:11], v2, v9, v[18:19] -; GFX9-NEXT: v_mov_b32_e32 v21, v20 -; GFX9-NEXT: v_mov_b32_e32 v20, v11 -; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[16:17], v0, v9, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v2, s[10:11], 0, v22, s[10:11] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[10:11], v3, v8, v[18:19] -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[14:15], v5, v8, v[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[10:11], 0, v2, s[10:11] +; GFX9-NEXT: v_mov_b32_e32 v12, v22 +; GFX9-NEXT: v_mul_lo_u32 v2, v2, v13 +; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[16:17], v0, v9, v[11:12] +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[10:11], 0, v24, s[10:11] +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[14:15], v5, v8, v[20:21] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[10:11], v3, v8, v[16:17] +; GFX9-NEXT: v_addc_co_u32_e64 v16, s[10:11], 0, v13, s[10:11] ; GFX9-NEXT: v_mul_lo_u32 v9, v1, v14 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[16:17] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v1, v8, v[20:21] -; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v12, v3, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[16:17] +; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[10:11], v1, v8, v[22:23] +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[10:11], v3, v4, s[10:11] ; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v24, v4, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v11, v5, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v23, v6, s[10:11] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v17, v0, 
s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[10:11], v6, v5, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[10:11], v16, v11, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v6, s[10:11], v26, v12, s[10:11] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v19, v0, s[10:11] ; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v9, s[14:15] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v28, s[12:13] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v27, s[8:9] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v26, s[6:7] -; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v25, s[4:5] -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v16, vcc -; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v7, v8, v[0:1] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[10:11], v0, v2, s[12:13] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[8:9], v0, v28, s[8:9] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[6:7], v0, v25, s[6:7] +; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], v0, v27, s[4:5] +; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v0, v18, vcc +; GFX9-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v7, v8, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v10 +; GFX9-NEXT: v_mov_b32_e32 v1, v13 +; GFX9-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-NEXT: v_mov_b32_e32 v7, v11 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_mul_i256: @@ -2609,68 +2626,69 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v16, v0 ; GFX10-NEXT: v_mov_b32_e32 v17, v1 -; GFX10-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX10-NEXT: v_mul_lo_u32 v28, v5, v10 +; GFX10-NEXT: v_mul_lo_u32 v29, v4, v11 +; GFX10-NEXT: v_mul_lo_u32 v31, v3, v12 +; GFX10-NEXT: v_mul_lo_u32 v30, v2, v13 ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v14, 0 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v12, 0 -; GFX10-NEXT: v_mul_lo_u32 v30, v17, v14 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v13, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v2, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[18:19] -; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, 1, s4 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, v20, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v16, v10, 0 -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v10, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v24, vcc_lo, 0, v22, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v5, v9, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] -; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v24, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[22:23], s4, v6, v8, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[20:21] -; GFX10-NEXT: v_cndmask_b32_e64 v25, 0, 1, s4 -; GFX10-NEXT: v_mov_b32_e32 v20, v22 -; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] -; GFX10-NEXT: v_add_co_ci_u32_e32 v29, vcc_lo, 0, v25, vcc_lo -; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[19:20] -; GFX10-NEXT: v_mov_b32_e32 v20, v18 +; GFX10-NEXT: v_mul_lo_u32 v28, v17, v14 +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v13, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v12, 0 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v2, v12, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v17, v11, v[0:1] +; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, 1, s4 +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v3, v11, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] +; GFX10-NEXT: v_add_co_ci_u32_e32 v22, vcc_lo, 0, 
v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v4, v10, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] +; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v22, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[20:21], s4, v5, v9, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s4, v16, v10, 0 +; GFX10-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e32 v26, vcc_lo, 0, v26, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[24:25], s4, v6, v8, v[20:21] +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[18:19] +; GFX10-NEXT: v_mov_b32_e32 v18, v23 +; GFX10-NEXT: v_cndmask_b32_e64 v27, 0, 1, s4 +; GFX10-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX10-NEXT: v_mov_b32_e32 v19, v24 +; GFX10-NEXT: v_mul_lo_u32 v24, v5, v10 +; GFX10-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, 0, v27, vcc_lo +; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v16, v13, v[18:19] ; GFX10-NEXT: v_mov_b32_e32 v19, v22 -; GFX10-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX10-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] -; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[19:20] +; GFX10-NEXT: v_mul_lo_u32 v27, v16, v15 +; GFX10-NEXT: v_mov_b32_e32 v18, v21 +; GFX10-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] +; GFX10-NEXT: v_mad_u64_u32 v[14:15], s6, v16, v11, v[18:19] ; GFX10-NEXT: v_mad_u64_u32 v[0:1], s5, v16, v8, 0 -; GFX10-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[24:25] -; GFX10-NEXT: v_mul_lo_u32 v25, v3, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, 1, s6 +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s5, v2, v11, v[21:22] ; GFX10-NEXT: v_mad_u64_u32 v[11:12], s6, v17, v10, v[14:15] -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mul_lo_u32 v24, v2, v13 -; GFX10-NEXT: v_mad_u64_u32 v[18:19], s7, v3, v10, v[18:19] ; GFX10-NEXT: v_mov_b32_e32 v13, v1 -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v2, v9, v[11:12] -; GFX10-NEXT: v_mov_b32_e32 v14, v21 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s6, 0, v6, s6 -; GFX10-NEXT: v_mad_u64_u32 v[10:11], s6, v4, v9, v[18:19] -; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v16, v9, v[13:14] -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8 -; GFX10-NEXT: v_mad_u64_u32 v[3:4], s8, v3, v8, v[1:2] -; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s8, 0, v6, s8 -; GFX10-NEXT: v_mad_u64_u32 v[5:6], s8, v5, v8, v[10:11] -; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[12:13] -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v3, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v29, v4, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v14, v5, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v6, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v23, v22, s9 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v30, s8 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v24, s6 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v25, s7 -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v20, s5 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v28, vcc_lo -; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v27, s4 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s6, 0, v32, s6 +; GFX10-NEXT: v_mov_b32_e32 v14, v20 +; GFX10-NEXT: v_mad_u64_u32 v[21:22], s7, v3, v10, v[18:19] +; GFX10-NEXT: v_mad_u64_u32 v[18:19], s6, v2, v9, v[11:12] +; GFX10-NEXT: v_add_co_ci_u32_e64 v15, s6, 0, v1, s6 +; GFX10-NEXT: v_mad_u64_u32 v[10:11], s8, v16, v9, v[13:14] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s6, v4, v9, v[21:22] +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s8 +; 
GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v3, v8, v[18:19] +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, 0, v15, s8 +; GFX10-NEXT: v_mad_u64_u32 v[14:15], s8, v5, v8, v[1:2] +; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[10:11] +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v4, v12, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v6, v13, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v9, v14, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v26, v15, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s9, v25, v27, s9 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s8, v9, v28, s8 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v30, s6 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s6, v9, v31, s7 +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, s5, v9, v29, s5 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v24, vcc_lo +; GFX10-NEXT: v_add_co_ci_u32_e64 v9, vcc_lo, v9, v23, s4 ; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v7, v8, v[9:10] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; @@ -2681,66 +2699,65 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX11-NEXT: v_dual_mov_b32 v18, v8 :: v_dual_mov_b32 v19, v7 ; GFX11-NEXT: v_mul_lo_u32 v30, v4, v11 ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v14, 0 -; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v12, 0 +; GFX11-NEXT: v_mul_lo_u32 v28, v16, v15 ; GFX11-NEXT: v_mul_lo_u32 v29, v17, v14 -; GFX11-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[7:8] -; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v2, v10, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v16, v10, 0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v3, v9, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[7:8], vcc_lo, v4, v18, v[7:8] -; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v24, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[22:23], null, v6, v18, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX11-NEXT: v_mov_b32_e32 v20, v8 -; GFX11-NEXT: v_cndmask_b32_e64 v26, 0, 1, s0 -; GFX11-NEXT: v_mov_b32_e32 v21, v22 -; GFX11-NEXT: v_mul_lo_u32 v22, v6, v9 -; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v2, v18, v[0:1] +; GFX11-NEXT: v_mul_lo_u32 v32, v3, v12 +; GFX11-NEXT: v_mul_lo_u32 v31, v2, v13 +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v17, v13, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v12, 0 +; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v2, v12, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], s0, v17, v11, v[0:1] +; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v11, v[20:21] +; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v10, v[7:8] +; GFX11-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v4, v10, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] +; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[20:21], null, v5, v9, v[7:8] +; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v16, v10, 0 +; GFX11-NEXT: v_mad_u64_u32 v[22:23], vcc_lo, v4, v18, v[0:1] ; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo -; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[20:21] -; 
GFX11-NEXT: v_mov_b32_e32 v6, v25 -; GFX11-NEXT: v_mul_lo_u32 v25, v16, v15 -; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v17, v12, v[0:1] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v16, v11, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[24:25], null, v6, v18, v[20:21] +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[7:8] +; GFX11-NEXT: v_mov_b32_e32 v7, v23 +; GFX11-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0 +; GFX11-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX11-NEXT: v_mov_b32_e32 v8, v24 +; GFX11-NEXT: v_mul_lo_u32 v24, v5, v10 +; GFX11-NEXT: v_mad_u64_u32 v[20:21], vcc_lo, v2, v18, v[0:1] +; GFX11-NEXT: v_add_co_ci_u32_e64 v27, null, 0, v27, vcc_lo +; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v16, v13, v[7:8] +; GFX11-NEXT: v_dual_mov_b32 v7, v22 :: v_dual_mov_b32 v6, v21 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] +; GFX11-NEXT: v_mad_u64_u32 v[14:15], s2, v16, v11, v[6:7] ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v18, 0 ; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s2 -; GFX11-NEXT: v_mad_u64_u32 v[14:15], s1, v2, v11, v[20:21] -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v17, v10, v[6:7] -; GFX11-NEXT: v_mul_lo_u32 v20, v2, v13 -; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v8, s2 -; GFX11-NEXT: v_mov_b32_e32 v11, v1 -; GFX11-NEXT: v_mad_u64_u32 v[13:14], s3, v3, v10, v[14:15] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v2, v9, v[6:7] -; GFX11-NEXT: v_mul_lo_u32 v21, v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v12, v24 -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v8, s2 -; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v4, v9, v[13:14] -; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v16, v9, v[11:12] -; GFX11-NEXT: v_cndmask_b32_e64 v11, 0, 1, s4 -; GFX11-NEXT: v_mad_u64_u32 v[3:4], s4, v3, v18, v[1:2] -; GFX11-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v10, s4 -; GFX11-NEXT: v_mad_u64_u32 v[5:6], s4, v5, v18, v[6:7] -; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[8:9] -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v11, v3, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v26, v4, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v10, v5, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v27, v6, s5 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v23, v25, s5 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s1, v2, v11, v[21:22] +; GFX11-NEXT: v_mad_u64_u32 v[11:12], s2, v17, v10, v[14:15] +; GFX11-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v8, s2 +; GFX11-NEXT: v_mad_u64_u32 v[21:22], s3, v3, v10, v[6:7] +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s2, v2, v9, v[11:12] +; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v1, s2 +; GFX11-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v9, v[13:14] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s2, v4, v9, v[21:22] +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v3, v18, v[6:7] +; GFX11-NEXT: v_add_co_ci_u32_e64 v12, null, 0, v12, s4 +; GFX11-NEXT: v_mad_u64_u32 v[6:7], s4, v5, v18, v[1:2] +; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v18, v[10:11] +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v8, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v27, v9, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v12, v6, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v7, s5 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v25, v28, s5 ; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v20, s2 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v21, s3 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s2 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v32, s3 ; GFX11-NEXT: 
v_add_co_ci_u32_e64 v7, null, v7, v30, s1 -; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v28, vcc_lo -; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v22, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v23, s0 ; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v19, v18, v[9:10] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2752,101 +2769,103 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX12-NEXT: v_mul_lo_u32 v27, v6, v9 -; GFX12-NEXT: v_mul_lo_u32 v28, v5, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mul_lo_u32 v29, v4, v11 +; GFX12-NEXT: v_mul_lo_u32 v31, v3, v12 +; GFX12-NEXT: v_mul_lo_u32 v30, v2, v13 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v14, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v12, 0 -; GFX12-NEXT: v_mul_lo_u32 v30, v17, v14 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v17, v13, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] +; GFX12-NEXT: v_mul_lo_u32 v28, v17, v14 +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v17, v13, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v12, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v2, v12, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, v12, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v16, v10, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v10, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v11, v[20:21] +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v9, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], null, v6, v8, v[0:1] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v25, 0, 1, s0 -; GFX12-NEXT: v_mov_b32_e32 v20, v22 +; GFX12-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v4, v10, 
v[0:1] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v2, v8, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] ; GFX12-NEXT: s_wait_alu 0xfffd -; GFX12-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v25, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[19:20] +; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], null, v5, v9, v[18:19] +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], null, v16, v10, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v26, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], null, v6, v8, v[20:21] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_mov_b32_e32 v18, v23 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: v_cndmask_b32_e64 v27, 0, 1, s0 +; GFX12-NEXT: v_mul_lo_u32 v23, v6, v9 +; GFX12-NEXT: v_mov_b32_e32 v19, v24 +; GFX12-NEXT: v_mul_lo_u32 v24, v5, v10 +; GFX12-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v8, v[0:1] +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v27, vcc_lo +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v16, v13, v[18:19] ; GFX12-NEXT: v_mov_b32_e32 v19, v22 -; GFX12-NEXT: v_mul_lo_u32 v22, v16, v15 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[24:25], vcc_lo, v17, v12, v[0:1] +; GFX12-NEXT: v_mul_lo_u32 v27, v16, v15 +; GFX12-NEXT: v_mov_b32_e32 v18, v21 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], vcc_lo, v17, v12, v[0:1] +; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[18:19] ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0 -; GFX12-NEXT: v_mov_b32_e32 v20, v18 -; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s2, v16, v11, v[19:20] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[24:25] -; GFX12-NEXT: v_mul_lo_u32 v20, v4, v11 -; GFX12-NEXT: v_mul_lo_u32 v25, v3, v12 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v6, 0, 1, s2 -; GFX12-NEXT: v_mul_lo_u32 v24, v2, v13 +; GFX12-NEXT: v_cndmask_b32_e64 v32, 0, 1, s2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s1, v2, v11, v[21:22] ; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s2, v17, v10, v[14:15] -; GFX12-NEXT: v_mad_co_u64_u32 v[18:19], s3, v3, v10, v[18:19] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v20 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2 -; GFX12-NEXT: v_dual_mov_b32 v13, v1 :: v_dual_mov_b32 v14, v21 +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v32, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[21:22], s3, v3, v10, v[18:19] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v2, v9, v[11:12] +; GFX12-NEXT: v_mad_co_u64_u32 
v[18:19], s2, v2, v9, v[11:12] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s2 -; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s2, v4, v9, v[18:19] -; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v16, v9, v[13:14] +; GFX12-NEXT: v_add_co_ci_u32_e64 v15, null, 0, v1, s2 +; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s4, v16, v9, v[13:14] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s2, v4, v9, v[21:22] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s4, v3, v8, v[1:2] +; GFX12-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v3, v8, v[18:19] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v6, s4 -; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s4, v5, v8, v[10:11] -; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[12:13] +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, 0, v15, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s4, v5, v8, v[1:2] +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[10:11] ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v3, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v4, v12, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v4, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v6, v13, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v14, v5, s5 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v9, v14, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v6, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v15, s5 ; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v23, v22, s5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s4 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v25, v27, s5 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, s4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v25, s3 -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v20, s1 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v30, s2 +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v31, s3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v29, s1 ; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v24, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v28, vcc_lo -; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v27, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e64 v9, null, v9, v23, s0 ; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v7, v8, v[9:10] ; GFX12-NEXT: s_setpc_b64 s[30:31] ; @@ -2855,87 +2874,89 @@ define i256 @v_mul_i256(i256 %num, i256 %den) { ; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX1250-NEXT: s_wait_kmcnt 0x0 ; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1 -; GFX1250-NEXT: v_mul_lo_u32 v27, v5, 
v10 -; GFX1250-NEXT: v_mul_lo_u32 v29, v3, v12 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mul_lo_u32 v30, v4, v11 +; GFX1250-NEXT: v_mul_lo_u32 v29, v5, v10 +; GFX1250-NEXT: v_mul_lo_u32 v31, v3, v12 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v12, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v17, v13, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[18:19] -; GFX1250-NEXT: v_cndmask_b32_e64 v20, 0, 1, s0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v2, v12, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v2, v10, v[18:19] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v20, vcc_lo -; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v16, v10, 0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v3, v9, v[18:19] -; GFX1250-NEXT: v_add_co_ci_u32_e64 v24, null, 0, v22, vcc_lo +; GFX1250-NEXT: v_mul_lo_u32 v32, v2, v13 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v17, v13, v[0:1] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v12, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v2, v12, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v4, v10, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], vcc_lo, v4, v8, v[18:19] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v4, v10, v[0:1] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v24, vcc_lo -; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v9, v[0:1] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_mad_nc_u64_u32 v[22:23], v6, v8, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[20:21] -; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22 -; GFX1250-NEXT: v_mul_lo_u32 v22, v6, v9 -; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo +; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v5, v9, v[18:19] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v10, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v26, vcc_lo ; GFX1250-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[24:25], s0, v2, v8, v[0:1] -; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[24:25], v6, v8, v[20:21] +; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX1250-NEXT: v_dual_mov_b32 v18, v23 :: v_dual_mov_b32 v19, v24 +; GFX1250-NEXT: v_mul_lo_u32 v24, v6, v9 +; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0 +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v2, v8, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0 ; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_dual_mov_b32 v20, v25 :: v_dual_mov_b32 v21, v18 -; GFX1250-NEXT: v_mul_lo_u32 v25, v4, v11 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v16, v11, v[20:21] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_cndmask_b32_e64 v28, 0, 1, s2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22 +; GFX1250-NEXT: v_mov_b32_e32 v13, v18 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s2, v16, v11, v[20:21] ; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1] +; GFX1250-NEXT: v_cndmask_b32_e64 v11, 0, 1, s2 ; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s2, v17, v10, v[18:19] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s3, v3, v10, v[20:21] -; GFX1250-NEXT: v_mul_lo_u32 v20, v2, v13 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v21, null, 0, v28, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v2, v9, v[18:19] -; GFX1250-NEXT: v_dual_mov_b32 v18, v1 :: v_dual_mov_b32 v19, v24 -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s4, v4, v9, v[10:11] -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v21, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s6, v16, v9, v[18:19] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_co_u64_u32 v[26:27], s2, v17, v10, v[22:23] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v33, null, 0, v11, s2 +; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s3, v3, v10, v[20:21] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mov_b32_e32 v12, v1 +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s2, v2, v9, v[26:27] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s6, v16, v9, v[12:13] +; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s4, v4, v9, v[22:23] +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v33, s2 ; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15 -; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[12:13] -; GFX1250-NEXT: v_cndmask_b32_e64 
v3, 0, 1, s6 ; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14 -; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[10:11] +; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[10:11] +; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2 -; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[18:19] +; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[20:21] ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v26, v11, s2 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v28, v11, s2 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v23, v2, s2 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v25, v2, s2 ; GFX1250-NEXT: v_mov_b32_e32 v2, v15 ; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v20, s4 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s3 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v32, s4 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v31, s3 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v25, s1 -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v27, s0 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v30, s1 +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s0 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v22, vcc_lo +; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v24, vcc_lo ; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1 ; GFX1250-NEXT: v_mov_b32_e32 v1, v14 ; GFX1250-NEXT: s_set_pc_i64 s[30:31] @@ -2949,60 +2970,60 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 -; GFX7-NEXT: buffer_load_dword v2, v[2:3], s[0:3], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 +; GFX7-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v3, 0 +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_mul_u64_zext_with_vregs: ; GFX8: ; %bb.0: -; GFX8-NEXT: flat_load_dword v2, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX8-NEXT: flat_load_dword v4, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_zext_with_vregs: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v3, 0x50 +; GFX9-NEXT: global_load_dword v4, v[2:3], off +; GFX9-NEXT: v_mov_b32_e32 v5, 0x50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: 
v_mad_u64_u32 v[2:3], s[0:1], v2, v3, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_mul_u64_zext_with_vregs: ; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v2, v[2:3], off +; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v2, 0 +; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_mul_u64_zext_with_vregs: ; GFX11: ; %bb.0: -; GFX11-NEXT: global_load_b32 v2, v[2:3], off +; GFX11-NEXT: global_load_b32 v4, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: s_mul_u64_zext_with_vregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v2, v[2:3], off +; GFX12-NEXT: global_load_b32 v4, v[2:3], off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v2, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v4, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: s_mul_u64_zext_with_vregs: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: global_load_b32 v2, v[2:3], off +; GFX1250-NEXT: global_load_b32 v4, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v2, 0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v4, 0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 @@ -3130,33 +3151,36 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX7-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 -; GFX7-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v5, v[3:4] +; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 +; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX7-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v6, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v3, v4 ; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: s_mul_u64_sext_with_vregs: ; GFX8: ; %bb.0: ; GFX8-NEXT: flat_load_dword v4, v[2:3] -; GFX8-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX8-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 +; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v3, v4 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: s_mul_u64_sext_with_vregs: ; GFX9: ; %bb.0: ; GFX9-NEXT: global_load_dword v4, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, 0x50 +; GFX9-NEXT: v_mov_b32_e32 v6, 0x50 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 -; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v4 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v4, v5, v[3:4] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0 +; 
GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v7, v6, v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v3, v4 ; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm ; @@ -3183,17 +3207,17 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr ; ; GFX12-LABEL: s_mul_u64_sext_with_vregs: ; GFX12: ; %bb.0: -; GFX12-NEXT: global_load_b32 v2, v[2:3], off +; GFX12-NEXT: global_load_b32 v4, v[2:3], off ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v2, 0 +; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v4, 0 ; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-NEXT: s_endpgm ; ; GFX1250-LABEL: s_mul_u64_sext_with_vregs: ; GFX1250: ; %bb.0: -; GFX1250-NEXT: global_load_b32 v2, v[2:3], off +; GFX1250-NEXT: global_load_b32 v4, v[2:3], off ; GFX1250-NEXT: s_wait_loadcnt 0x0 -; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v2, 0 +; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v4, 0 ; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX1250-NEXT: s_endpgm %val = load i32, ptr addrspace(1) %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll index 4f2c454e13356..01c601f0646b5 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll @@ -31,128 +31,128 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v3, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v3, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v3 ; CHECK-NEXT: v_trunc_f32_e32 v8, v6 ; CHECK-NEXT: v_mac_f32_e32 v3, 0xcf800000, v8 -; CHECK-NEXT: v_cvt_u32_f32_e32 v9, v3 -; CHECK-NEXT: v_cvt_u32_f32_e32 v12, v8 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v3 +; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v8 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v9, v7 -; CHECK-NEXT: v_mul_lo_u32 v14, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6 +; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v13, v3 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: 
v_add_i32_e32 v3, vcc, v8, v3 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CHECK-NEXT: v_mul_hi_u32 v7, v12, v7 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v12, vcc, v12, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v9, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v3 +; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v11, 0 ; CHECK-NEXT: v_mov_b32_e32 v3, v7 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v10, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v10 -; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v10, vcc -; CHECK-NEXT: v_xor_b32_e32 v8, v3, v10 -; CHECK-NEXT: v_mul_lo_u32 v3, v12, v6 -; CHECK-NEXT: v_mul_lo_u32 v5, v9, v7 -; CHECK-NEXT: v_xor_b32_e32 v11, v4, v10 -; CHECK-NEXT: v_mul_hi_u32 v4, v9, v6 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v6 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v12 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v14, v6 +; CHECK-NEXT: v_mul_lo_u32 v5, v11, v9 +; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v11, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v6 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v12, v7 +; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v5, v9, v7 +; CHECK-NEXT: v_mul_hi_u32 v5, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v12, v7 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v11, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v8, v4 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v11, v3 -; CHECK-NEXT: v_mul_hi_u32 v9, v11, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: 
v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v11, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v8, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v7, 0 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v9, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v7, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v11, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v11, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v8, v9, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v9, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v8 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v9, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v10, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v6 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, 
vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -220,65 +220,65 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v2, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1 -; CHECK-NEXT: 
v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s13, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s12, v1 ; CHECK-NEXT: v_mul_hi_u32 v4, s12, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, s13, v0 -; CHECK-NEXT: v_mul_hi_u32 v5, s13, v1 +; CHECK-NEXT: v_mov_b32_e32 v7, s13 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -291,39 +291,39 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v4, s13, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v2, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v5, s13 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v4, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s12, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v3, s11 -; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s13, v1 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; CHECK-NEXT: v_mov_b32_e32 v1, s11 +; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc +; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s13, v4 ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v2 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s10, v0 ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, 
vcc, 1, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v2 ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s11, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v2, v5, v6, s[0:1] -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v2, v4, v5, s[0:1] +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; CHECK-NEXT: v_cmp_le_u32_e32 vcc, s10, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s11, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v3 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: s_xor_b64 s[0:1], s[6:7], s[8:9] ; CHECK-NEXT: v_xor_b32_e32 v0, s0, v0 ; CHECK-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 @@ -382,263 +382,263 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_xor_b32_e32 v4, v5, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v10 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v4 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v10 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v4, vcc +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v10 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v4, vcc ; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v5 ; GISEL-NEXT: v_trunc_f32_e32 v9, v9 ; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v5 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v5 ; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v9, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v9, v[5:6] ; GISEL-NEXT: v_mul_lo_u32 v5, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v17, v14, v11 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v11 ; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v9, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v9, v14 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; 
GISEL-NEXT: v_mul_hi_u32 v13, v9, v14 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v5 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v5 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; GISEL-NEXT: v_mov_b32_e32 v5, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[5:6] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[5:6] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v9 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v5, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v9 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v9 +; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11 +; GISEL-NEXT: v_mul_lo_u32 v5, v16, v14 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v14, v12 +; GISEL-NEXT: v_mul_hi_u32 v5, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v5, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v11, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v12, v16, v1 +; GISEL-NEXT: v_mul_lo_u32 v12, v18, v1 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; GISEL-NEXT: v_mul_hi_u32 v11, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: 
v_add_i32_e32 v13, vcc, v12, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1 ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v10, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v5 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v5 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v14, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v10, v16, v[1:2] ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v7 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v5 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v7, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v0, v[12:13] ; GISEL-NEXT: v_xor_b32_e32 v7, v1, v5 ; GISEL-NEXT: v_xor_b32_e32 v6, v6, v5 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v0, v[12:13] ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v7 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v6 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11 +; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v4, vcc +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v4, vcc ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 +; GISEL-NEXT: v_trunc_f32_e32 v15, v11 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15 ; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 ; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v7 ; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v6, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v10 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v10 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v10 +; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v10 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v4 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v10 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v4 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v4 -; GISEL-NEXT: v_mul_lo_u32 v10, v18, v12 +; GISEL-NEXT: v_mul_lo_u32 v10, v18, v14 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 ; GISEL-NEXT: v_mul_hi_u32 v10, v18, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, 
s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v4 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v4 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v4, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v10 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v4, v13, v15, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v15, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[6:7] -; GISEL-NEXT: v_cndmask_b32_e32 v15, v15, v21, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v13, v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v17, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v12, v22, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v18, v14 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v12, vcc, v18, v10 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v16, v11, vcc +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v11, vcc ; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v19, v12, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v13, v[0:1] -; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v15, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v11 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v11, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v11 -; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v4, v16, v13, vcc +; GISEL-NEXT: v_xor_b32_e32 v13, v9, v8 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v19, v14, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v11, v1, v13 +; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v12, v[8:9] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v15, vcc +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v15 +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v10 ; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0 -; GISEL-NEXT: v_xor_b32_e32 v16, v2, v11 +; GISEL-NEXT: v_xor_b32_e32 v17, v2, 
v15 ; GISEL-NEXT: v_mul_hi_u32 v2, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v9, v4, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v10 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v13, v10 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v13, v0, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v10, v14, v8 +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v17, v1 +; GISEL-NEXT: v_mul_lo_u32 v3, v16, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v4, v13 +; GISEL-NEXT: v_mul_hi_u32 v4, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 +; GISEL-NEXT: v_mul_lo_u32 v4, v17, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v1, v2 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v1, v2 +; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v9, v8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v12, v[3:4] -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v8, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v15, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v11, v13 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v13, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v10, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v16, v2 +; GISEL-NEXT: 
v_subb_u32_e64 v3, s[4:5], v17, v8, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v17, v8 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6 +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v6, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v6 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v8, v9, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v12 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v6 +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v8, v9, s[4:5] +; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v10 +; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v12, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v8 ; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v11, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v6, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v15, v5 ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -667,100 +667,100 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v3, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v1 -; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; CGP-NEXT: v_rcp_iflag_f32_e32 v3, v3 ; CGP-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; CGP-NEXT: v_trunc_f32_e32 v5, v4 ; CGP-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v3 -; CGP-NEXT: v_cvt_u32_f32_e32 v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v18, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v3 +; CGP-NEXT: v_cvt_u32_f32_e32 v17, v5 +; CGP-NEXT: 
v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13] +; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v3 +; CGP-NEXT: v_mul_lo_u32 v13, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 +; CGP-NEXT: v_mul_lo_u32 v18, v17, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v14, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v17, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v18, v3 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v4, vcc -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v12, 0 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v15, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v16, v12, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v12, v[4:5] -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v13 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 -; CGP-NEXT: v_mul_lo_u32 v5, v15, v3 -; CGP-NEXT: v_mul_lo_u32 v14, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v3, v15, v3 -; CGP-NEXT: v_xor_b32_e32 v10, v10, v13 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v16 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v3 +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc +; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v16, v14, v[12:13] +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v15 +; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v15, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v5, v15 +; CGP-NEXT: v_mul_lo_u32 v5, v17, v3 +; CGP-NEXT: v_mul_lo_u32 v11, v14, v4 +; CGP-NEXT: v_xor_b32_e32 v13, v10, v15 +; CGP-NEXT: v_mul_hi_u32 v10, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v17, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v16, v15, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_mul_hi_u32 v14, v12, v4 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v16, v3 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v17, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3 +; CGP-NEXT: 
v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CGP-NEXT: v_addc_u32_e32 v4, vcc, v15, v4, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v10, v3 -; CGP-NEXT: v_mul_lo_u32 v12, v11, v4 -; CGP-NEXT: v_mul_hi_u32 v14, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v3, v10, v3 -; CGP-NEXT: v_mul_hi_u32 v15, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v10, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_mul_hi_u32 v12, v11, v4 ; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v17, v4, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v11, v12, v3 +; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v11, v13, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v4 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; CGP-NEXT: v_add_i32_e32 v14, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v11, v13, v4 ; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v14, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v5 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v12, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v11, v3 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[4:5] -; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v4, vcc -; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v10, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v2, v16, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v12, v3 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v14, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v13, v4, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v13, v4 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 ; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] @@ -771,13 +771,13 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v10, v11, s[4:5] ; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v14 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v12, vcc +; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v16, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; 
CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v15, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v2, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v10 ; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v11, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 @@ -785,8 +785,8 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v3, v13, v0 -; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; CGP-NEXT: v_xor_b32_e32 v3, v15, v0 +; CGP-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v3 ; CGP-NEXT: v_xor_b32_e32 v1, v2, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -840,126 +840,126 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v5, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v11, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CGP-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CGP-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CGP-NEXT: v_trunc_f32_e32 v7, v6 ; CGP-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CGP-NEXT: v_cvt_u32_f32_e32 v10, v5 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] -; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v10, v6 -; CGP-NEXT: v_mul_lo_u32 v16, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v5 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v7 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7] +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11] +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v5 +; CGP-NEXT: v_mul_lo_u32 v11, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v16, v15, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v12, v6 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v15, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v16, v5 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 -; CGP-NEXT: 
v_addc_u32_e32 v13, vcc, v13, v6, vcc -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v11, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v10, v[6:7] -; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v11 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v11, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v7, v11 -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v12, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v11 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v5 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v6, vcc +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[6:7] +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v12, v[10:11] +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v13 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v9, v12, v6 +; CGP-NEXT: v_xor_b32_e32 v14, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v14, v13, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CGP-NEXT: v_mul_hi_u32 v12, v10, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v8, v15, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v6, v15, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v12, v7 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v8, v5 -; CGP-NEXT: v_mul_lo_u32 v10, v9, v6 -; CGP-NEXT: v_mul_hi_u32 v12, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 -; CGP-NEXT: v_mul_hi_u32 v13, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v12, v8, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_mul_hi_u32 v10, v9, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v15, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v14, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v11, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v5 +; CGP-NEXT: v_add_i32_e32 v7, 
vcc, v7, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v14, v6 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v12, vcc, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v14, v6 ; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v12, 0 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v7 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v12, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v8, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v8, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v15, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v11, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v12, v[7:8] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v14, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v14, v9 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] ; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v15, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v11, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v13, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -1049,82 +1049,82 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705 ; 
CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, 
v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1133,40 +1133,40 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7 -; CHECK-NEXT: 
v_addc_u32_e32 v8, vcc, 0, v3, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v1, -1, v3, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v7 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; CHECK-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4 -; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v3 +; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v4, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %num, 1235195 ret i64 %result @@ -1215,46 +1215,46 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v7, v[4:5] ; GISEL-NEXT: v_mul_lo_u32 v4, v7, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v14, v18, v16 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_mul_hi_u32 v13, v7, v13 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v15 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v7, v14 -; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v16, v14 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; 
GISEL-NEXT: v_mul_lo_u32 v14, v7, v16 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v18, v16 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v17 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v14 +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v15 +; GISEL-NEXT: v_mul_hi_u32 v13, v7, v16 ; GISEL-NEXT: v_xor_b32_e32 v19, v1, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v14, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 ; GISEL-NEXT: v_mov_b32_e32 v7, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 @@ -1263,46 +1263,46 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v19, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v18, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v16, v[1:2] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 +; GISEL-NEXT: v_add_i32_e32 v20, vcc, v15, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v7, v20, v[1:2] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v17, v0 +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v15, vcc +; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v15 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v15 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v18 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v20, vcc 
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v19, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] +; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v16 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 ; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v16, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1319,74 +1319,74 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v16, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v15, v13, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v4 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v20, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v18, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v4 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v9, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 
v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v9, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v8, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v9, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v8, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v7, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v6, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v11, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v9, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v13, v3 +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v11, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v9, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v9, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v7 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7 @@ -1394,8 +1394,8 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v5, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v11, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v7 ; GISEL-NEXT: v_cndmask_b32_e64 
v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 @@ -1406,12 +1406,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_oddk_denom: @@ -1430,112 +1430,112 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 ; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] ; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] ; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 +; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: 
v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 +; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v16, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v18, v0 -; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v19, v13, vcc -; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v19, v13 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v18, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v18, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0 +; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v17, v13, vcc +; CGP-NEXT: v_sub_i32_e64 
v13, s[4:5], v17, v13 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 -; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v15 -; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v16, vcc +; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc +; CGP-NEXT: v_add_i32_e32 v17, vcc, 1, v16 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, 0, v18, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4 ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc +; CGP-NEXT: v_cndmask_b32_e64 v15, -1, v14, s[4:5] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cndmask_b32_e64 v20, 0, -1, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] +; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v20, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v17 -; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc +; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v19, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc +; CGP-NEXT: v_cndmask_b32_e32 v14, v17, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 @@ -1553,72 +1553,72 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 -; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v16, v13, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v14, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v12, v18, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: 
v_mul_lo_u32 v6, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v10, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -1626,24 +1626,24 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5] -; 
CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CGP-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 ; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 -; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = sdiv <2 x i64> %num, <i64 1235195, i64 1235195> ret <2 x i64> %result @@ -1679,126 +1679,126 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v5, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v2 ; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 +; CHECK-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v5, 0x4f800000, v6 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5 ; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5 ; CHECK-NEXT: v_trunc_f32_e32 v7, v6 ; CHECK-NEXT: v_mac_f32_e32 v5, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v5 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_lo_u32 v7, v11, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v13, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v14, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v12 -; CHECK-NEXT: v_mul_hi_u32 v12, v8, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v14, v7 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7] +; CHECK-NEXT: v_mul_lo_u32 v6, v14, v5 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5 +; CHECK-NEXT: v_mul_lo_u32 v8, v11, v9 +; CHECK-NEXT: v_mul_lo_u32 v10, v14, v9 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; CHECK-NEXT: 
v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CHECK-NEXT: v_mul_hi_u32 v7, v11, v9 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v5 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v6, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[6:7] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v3, v9 -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v10, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v12, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CHECK-NEXT: v_mul_hi_u32 v8, v14, v9 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v11, v5 +; CHECK-NEXT: v_addc_u32_e32 v14, vcc, v14, v6, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v11, 0 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v12, v14, v[6:7] +; CHECK-NEXT: v_ashrrev_i32_e32 v12, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 +; CHECK-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[7:8] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc +; CHECK-NEXT: v_xor_b32_e32 v10, v3, v12 +; CHECK-NEXT: v_mul_lo_u32 v3, v14, v5 +; CHECK-NEXT: v_mul_lo_u32 v6, v11, v9 +; CHECK-NEXT: v_xor_b32_e32 v13, v4, v12 +; CHECK-NEXT: v_mul_hi_u32 v4, v11, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v14, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v10, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v4, v14, v9 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v11, v9 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v14, v9 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v11, v4, vcc -; CHECK-NEXT: v_mul_lo_u32 v5, v12, v3 -; CHECK-NEXT: v_mul_lo_u32 v6, v7, v4 -; CHECK-NEXT: v_mul_hi_u32 v8, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v12, v3 -; CHECK-NEXT: v_mul_hi_u32 v10, v12, v4 +; CHECK-NEXT: v_add_i32_e32 v3, 
vcc, v11, v3 +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v14, v4, vcc +; CHECK-NEXT: v_mul_lo_u32 v5, v13, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v10, v4 +; CHECK-NEXT: v_mul_hi_u32 v7, v10, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v13, v3 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v8, v12, v4 +; CHECK-NEXT: v_mul_lo_u32 v7, v13, v4 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v4 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v3, v5 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v8, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_add_i32_e32 v9, vcc, v3, v5 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v4 +; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v2, v9, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v10, v5 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v8, v[4:5] -; CHECK-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v4, vcc -; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v4 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v11, vcc, v7, v5 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[4:5] +; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v10, v3 +; CHECK-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v9, v[5:6] +; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v13, v7, vcc +; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v7 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2 ; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[4:5] -; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v8 -; CHECK-NEXT: v_addc_u32_e32 v10, vcc, 0, v6, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc +; CHECK-NEXT: v_cndmask_b32_e64 v4, v6, v7, s[4:5] +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v9 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, 0, v11, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v11, v2, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, 1, v7 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v10, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v2, vcc +; CHECK-NEXT: v_add_i32_e32 
v2, vcc, 1, v6 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, 0, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v3, v9, v0 -; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v3, v12, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc ; CHECK-NEXT: v_xor_b32_e32 v0, v1, v3 ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 @@ -1850,8 +1850,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_xor_b32_e32 v5, v7, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v5 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v8 -; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v8 +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v5, vcc ; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v11 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v10 @@ -1859,182 +1859,183 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7 ; GISEL-NEXT: v_trunc_f32_e32 v13, v11 ; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v7 -; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v7 +; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] -; GISEL-NEXT: v_mul_lo_u32 v7, v17, v11 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v7, v19, v11 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v18, v7 -; GISEL-NEXT: v_mul_hi_u32 v18, v14, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v13, v16, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v18 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; 
GISEL-NEXT: v_mul_hi_u32 v13, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v7 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v11, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v7 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v16, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[7:8] ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v7 -; GISEL-NEXT: v_mul_lo_u32 v0, v17, v11 -; GISEL-NEXT: v_mul_lo_u32 v13, v14, v12 -; GISEL-NEXT: v_xor_b32_e32 v16, v1, v7 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v11 -; GISEL-NEXT: v_mul_hi_u32 v11, v17, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 +; GISEL-NEXT: v_mul_lo_u32 v0, v19, v11 +; GISEL-NEXT: v_mul_lo_u32 v12, v16, v14 +; GISEL-NEXT: v_xor_b32_e32 v18, v1, v7 +; GISEL-NEXT: v_mul_hi_u32 v1, v16, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v19, v11 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v13, v14, v12 +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v14 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v12, v16, v14 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 -; GISEL-NEXT: v_mul_hi_u32 v12, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v19, v14 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v16, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v15, v1 -; GISEL-NEXT: v_mul_hi_u32 v13, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v16, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v11, v18, v0 +; GISEL-NEXT: v_mul_lo_u32 v12, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 ; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v16, v1 +; GISEL-NEXT: v_mul_lo_u32 v13, v18, v1 ; GISEL-NEXT: 
v_add_i32_e32 v11, vcc, v12, v11 -; GISEL-NEXT: v_mul_hi_u32 v12, v15, v1 +; GISEL-NEXT: v_mul_hi_u32 v12, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 +; GISEL-NEXT: v_mul_hi_u32 v1, v18, v1 ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v0, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v1, v13 +; GISEL-NEXT: v_add_i32_e32 v16, vcc, v1, v13 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v14, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v8, v16, v[1:2] ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v6 ; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v10, v6, vcc -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v5, v0, v[12:13] ; GISEL-NEXT: v_xor_b32_e32 v10, v1, v6 ; GISEL-NEXT: v_xor_b32_e32 v9, v9, v6 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v5, v0, v[12:13] ; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v10 -; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v9 -; GISEL-NEXT: v_sub_i32_e32 v15, vcc, v15, v11 -; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v16, v12 -; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v13 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v9 +; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v17, v11 +; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], v18, v14 +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v12 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v16, v12, vcc -; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v11, v5, vcc +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v18, v14, vcc +; GISEL-NEXT: v_subb_u32_e32 v14, vcc, v11, v5, vcc ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 ; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v16, v11 -; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v16 +; GISEL-NEXT: v_trunc_f32_e32 v15, v11 +; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v15 ; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v1 ; GISEL-NEXT: v_sub_i32_e32 v19, vcc, 0, v10 ; GISEL-NEXT: v_subb_u32_e32 v20, vcc, 0, v9, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v19, v18, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v15, v8 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v8 -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc +; GISEL-NEXT: v_cvt_u32_f32_e32 v22, v15 +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v13, v8 +; GISEL-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8 ; GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v16, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v5 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v20, v18, v[12:13] -; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[4:5] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v8 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v19, v22, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v22, v11 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v21, v5 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v20, v18, v[12:13] +; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v17, v5 -; GISEL-NEXT: v_mul_lo_u32 v8, v18, v12 +; GISEL-NEXT: v_mul_lo_u32 v8, v18, v14 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 ; GISEL-NEXT: v_mul_hi_u32 v8, v18, v11 -; 
GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] +; GISEL-NEXT: v_mul_hi_u32 v11, v22, v11 ; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], v1, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v15, v5 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v21, v5 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v17, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 -; GISEL-NEXT: v_cndmask_b32_e64 v5, v15, v21, s[4:5] -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v14, vcc -; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v8 -; GISEL-NEXT: v_addc_u32_e32 v21, vcc, 0, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, v13, v15, s[4:5] +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v16, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, 1, v8 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v13, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v17, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v21, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[6:7] -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 -; GISEL-NEXT: v_mul_lo_u32 v15, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v12, v16, v12 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v15, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v17, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[6:7] +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v22, v14 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; GISEL-NEXT: v_mul_hi_u32 v13, v18, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v17, v15 +; GISEL-NEXT: v_mul_hi_u32 v14, v22, v14 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v11 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v16, v12, vcc +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v22, v12, vcc ; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v19, v13, 0 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v19, v15, v[0:1] -; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 -; GISEL-NEXT: v_cndmask_b32_e32 v8, v14, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[0:1] -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v12 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v3, v12, vcc -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v12 -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v11 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v14, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v12, v1, v7 +; GISEL-NEXT: v_ashrrev_i32_e32 v15, 31, v3 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v20, v13, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v15 +; GISEL-NEXT: v_addc_u32_e32 
v2, vcc, v3, v15, vcc +; GISEL-NEXT: v_xor_b32_e32 v5, v1, v15 +; GISEL-NEXT: v_mul_lo_u32 v1, v14, v11 ; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v16, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v16, v2, v15 ; GISEL-NEXT: v_mul_hi_u32 v2, v13, v11 -; GISEL-NEXT: v_mul_hi_u32 v4, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v4, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -2042,25 +2043,25 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 -; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v15, v0, vcc +; GISEL-NEXT: v_addc_u32_e32 v0, vcc, v14, v0, vcc ; GISEL-NEXT: v_mul_lo_u32 v2, v16, v1 -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v4, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v3, v5, v0 +; GISEL-NEXT: v_mul_hi_u32 v4, v5, v1 ; GISEL-NEXT: v_mul_hi_u32 v1, v16, v1 -; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7 +; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, v16, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; GISEL-NEXT: v_mul_hi_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v3, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3 @@ -2074,39 +2075,38 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v0, v1 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v13, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v8, v8, v7 -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v5, v7 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v11, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v7 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v14, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v16, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v16, v3 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v9 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v9, v11, v[3:4] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v7, vcc +; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v9 +; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v4, v9, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v10 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v9 -; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v7, s[4:5] +; 
GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 +; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v7, s[4:5] ; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v11 ; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v9 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v4, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v5 ; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v7, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc -; GISEL-NEXT: v_xor_b32_e32 v4, v12, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc +; GISEL-NEXT: v_xor_b32_e32 v4, v15, v6 ; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4 ; GISEL-NEXT: v_xor_b32_e32 v3, v3, v4 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 @@ -2138,126 +2138,126 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v10, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v11, v1 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v17, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v18, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v10, 0x4f800000, v11 ; CGP-NEXT: v_rcp_iflag_f32_e32 v10, v10 ; CGP-NEXT: v_mul_f32_e32 v10, 0x5f7ffffc, v10 ; CGP-NEXT: v_mul_f32_e32 v11, 0x2f800000, v10 ; CGP-NEXT: v_trunc_f32_e32 v12, v11 ; CGP-NEXT: v_mac_f32_e32 v10, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v17, v13, v10 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_mul_lo_u32 v12, v16, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v18, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v19, v16, v11 -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v18 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v17 -; CGP-NEXT: v_mul_hi_u32 v17, v13, v11 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v18, v12 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v19, v10 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v17, vcc, v18, v17 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 +; CGP-NEXT: v_cvt_u32_f32_e32 v19, v12 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12] +; CGP-NEXT: v_mul_lo_u32 v11, v19, v10 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; CGP-NEXT: v_mul_hi_u32 v12, v16, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v19, v10 +; CGP-NEXT: v_mul_lo_u32 v13, v16, v14 +; CGP-NEXT: v_mul_lo_u32 v15, v19, v14 
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 +; CGP-NEXT: v_mul_hi_u32 v12, v16, v14 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v10 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v11, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[11:12] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v8, v14 -; CGP-NEXT: v_mul_lo_u32 v8, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v15, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v17, v9, v14 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; CGP-NEXT: v_mul_hi_u32 v13, v19, v14 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v16, v10 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v19, v11, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v16, 0 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v17, v19, v[11:12] +; CGP-NEXT: v_ashrrev_i32_e32 v17, 31, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v17 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v18, v16, v[12:13] +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v17, vcc +; CGP-NEXT: v_xor_b32_e32 v15, v8, v17 +; CGP-NEXT: v_mul_lo_u32 v8, v19, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v16, v14 +; CGP-NEXT: v_xor_b32_e32 v18, v9, v17 +; CGP-NEXT: v_mul_hi_u32 v9, v16, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v19, v10 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v16, v11 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v9, v19, v14 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v16, v14 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v19, v14 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v11, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CGP-NEXT: v_addc_u32_e32 v9, vcc, v16, v9, vcc -; CGP-NEXT: v_mul_lo_u32 v10, v17, v8 -; CGP-NEXT: v_mul_lo_u32 v11, v12, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v17, v8 -; CGP-NEXT: v_mul_hi_u32 v15, v17, v9 +; CGP-NEXT: 
v_add_i32_e32 v8, vcc, v16, v8 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, v19, v9, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v18, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v15, v9 +; CGP-NEXT: v_mul_hi_u32 v12, v15, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v18, v8 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v17, v9 +; CGP-NEXT: v_mul_lo_u32 v12, v18, v9 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v9 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v13, v8 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v11, v15, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v8, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v13, 0 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v8, v10 +; CGP-NEXT: v_mul_hi_u32 v12, v18, v9 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v14, 0 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v15, v10 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v11, v[9:10] -; CGP-NEXT: v_sub_i32_e32 v8, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v13, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v17, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v9 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subb_u32_e32 v9, vcc, v9, v1, vcc -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v16, vcc, v12, v10 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v4, v16, v[9:10] +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v15, v8 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v1, v14, v[10:11] +; CGP-NEXT: v_subb_u32_e64 v9, s[4:5], v18, v12, vcc +; CGP-NEXT: v_sub_i32_e64 v10, s[4:5], v18, v12 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 +; CGP-NEXT: v_subb_u32_e32 v10, vcc, v10, v1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4 ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v4 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v1 -; CGP-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, v12, v15, s[4:5] -; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v13 -; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v1 +; CGP-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc +; CGP-NEXT: v_cndmask_b32_e64 v9, v11, v12, s[4:5] +; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v14 +; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v16, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v10, v1 +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v16, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v12 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v15, vcc +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v1 +; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v11 +; CGP-NEXT: 
v_addc_u32_e32 v8, vcc, 0, v12, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; CGP-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v15, v8, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 -; CGP-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v8, v14, v0 -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 +; CGP-NEXT: v_cndmask_b32_e32 v1, v14, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v8, v17, v0 +; CGP-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v0, v1, v8 ; CGP-NEXT: v_xor_b32_e32 v1, v4, v8 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 @@ -2313,128 +2313,128 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v3, v6, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v4 ; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v4 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v4 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v6, 0x4f800000, v8 ; CGP-NEXT: v_rcp_iflag_f32_e32 v6, v6 ; CGP-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6 ; CGP-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6 ; CGP-NEXT: v_trunc_f32_e32 v10, v8 ; CGP-NEXT: v_mac_f32_e32 v6, 0xcf800000, v10 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v10 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_mul_lo_u32 v6, v14, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v15, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v16, v14, v9 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7] +; CGP-NEXT: v_mul_lo_u32 v6, v16, v8 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v9, v13, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v16, v8 +; CGP-NEXT: v_mul_lo_u32 v10, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v16, v11 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v15, v6 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v16, v8 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v14, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v6 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, 
v11, 0 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v6 +; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v8, vcc +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v6, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v14, v[6:7] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v10, v5, v12 -; CGP-NEXT: v_mul_lo_u32 v5, v14, v8 -; CGP-NEXT: v_mul_lo_u32 v7, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v6, v12 -; CGP-NEXT: v_mul_hi_u32 v6, v11, v8 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v16, v[6:7] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v14 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v7, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v12, v5, v14 +; CGP-NEXT: v_mul_lo_u32 v5, v16, v8 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v11 +; CGP-NEXT: v_xor_b32_e32 v15, v6, v14 +; CGP-NEXT: v_mul_hi_u32 v6, v13, v8 +; CGP-NEXT: v_mul_hi_u32 v8, v16, v8 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v14, v9 +; CGP-NEXT: v_mul_lo_u32 v6, v16, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v11, v9 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v11 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v14, v9 +; CGP-NEXT: v_mul_hi_u32 v8, v16, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5 -; CGP-NEXT: v_addc_u32_e32 v6, vcc, v14, v6, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v5 -; CGP-NEXT: v_mul_lo_u32 v8, v10, v6 -; CGP-NEXT: v_mul_hi_u32 v9, v10, v5 -; CGP-NEXT: v_mul_hi_u32 v5, v13, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v13, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v16, v6, vcc +; CGP-NEXT: v_mul_lo_u32 v7, v15, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v12, v6 +; CGP-NEXT: v_mul_hi_u32 v9, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v15, v5 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v6 +; CGP-NEXT: v_mul_lo_u32 v9, v15, v6 ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_mul_hi_u32 v8, v10, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v5, v7 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, 0 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v5, v7 +; CGP-NEXT: v_mul_hi_u32 v9, v15, v6 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, 0 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v7 -; CGP-NEXT: v_mad_u64_u32 
v[6:7], s[4:5], v4, v8, v[6:7] -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v10, v5 -; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v9, v[6:7] -; CGP-NEXT: v_subb_u32_e64 v7, s[4:5], v13, v6, vcc -; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v13, v6 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CGP-NEXT: v_add_i32_e32 v13, vcc, v9, v7 +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v13, v[6:7] +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v11, v[7:8] +; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v15, v9, vcc +; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v15, v9 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4 ; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3 -; CGP-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc -; CGP-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[4:5] -; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v9 -; CGP-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc +; CGP-NEXT: v_cndmask_b32_e64 v6, v8, v9, s[4:5] +; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v11 +; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v13, vcc +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3 -; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v10 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v11, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3 ; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc -; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v5, v12, v2 -; CGP-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v8 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc +; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 +; CGP-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v5, v14, v2 +; CGP-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc ; CGP-NEXT: v_xor_b32_e32 v2, v3, v5 ; CGP-NEXT: v_xor_b32_e32 v3, v4, v5 ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 @@ -2504,15 +2504,15 @@ define i64 @v_sdiv_i64_24bit(i64 %num, i64 %den) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v1, v4 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; CGP-NEXT: 
v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0 ; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v1 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 @@ -2537,198 +2537,198 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 -; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v10, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GISEL-NEXT: v_trunc_f32_e32 v5, v4 ; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v7, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v3 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v7, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v11, v[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: 
v_mul_lo_u32 v8, v7, v4 -; GISEL-NEXT: v_mul_hi_u32 v0, v7, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v3 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v9, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v9, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v7, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v8, v7, v4 +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 +; GISEL-NEXT: v_mul_hi_u32 v7, v9, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v3 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 -; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v2 +; GISEL-NEXT: v_and_b32_e32 v13, 0xffffff, v2 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GISEL-NEXT: v_mul_lo_u32 v5, 0, v3 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v11, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v3 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v0, 0 +; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v3 ; GISEL-NEXT: v_mov_b32_e32 v5, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v4, v[5:6] -; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v8 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[5:6] -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v10, v7 +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v4, v[5:6] +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v10 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v12 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v0, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v7 ; GISEL-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v2 ; 
GISEL-NEXT: v_trunc_f32_e32 v8, v6 ; GISEL-NEXT: v_mac_f32_e32 v2, 0xcf800000, v8 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v13, s[4:5], 0, v3 -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v2 +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, v5, vcc +; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v3 +; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v8 ; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 ; GISEL-NEXT: v_mov_b32_e32 v2, v7 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v15, v[2:3] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v1 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v17, v[2:3] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v12, v[7:8] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v2, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v2, v15, v6 -; GISEL-NEXT: v_mul_lo_u32 v10, v12, v7 -; GISEL-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v5, vcc -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, v[7:8] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v2, s[4:5] +; GISEL-NEXT: v_mul_lo_u32 v2, v17, v6 +; GISEL-NEXT: v_mul_lo_u32 v7, v14, v9 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v5, v15, v7 -; GISEL-NEXT: v_mul_hi_u32 v6, v15, v6 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v7 +; GISEL-NEXT: v_mul_lo_u32 v5, v17, v9 +; GISEL-NEXT: v_mul_hi_u32 v6, v17, v6 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; GISEL-NEXT: v_mul_hi_u32 v7, v14, v9 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_mul_hi_u32 v7, v15, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v17, v9 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v2 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v10, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v9, v1 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v2 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v17, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v12, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v11, v1 ; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, v[2:3] -; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v16, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v14, v10, v[6:7] +; 
GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[2:3] +; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[6:7] ; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v4, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v1 -; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v6 -; GISEL-NEXT: v_mul_hi_u32 v14, v10, v5 +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v5 +; GISEL-NEXT: v_mul_lo_u32 v7, v12, v8 +; GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v4, vcc +; GISEL-NEXT: v_mul_hi_u32 v9, v12, v5 +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 ; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5 -; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v6 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v14, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v14, v8 +; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_mul_hi_u32 v7, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v6 ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v12, v6, vcc +; GISEL-NEXT: v_mul_hi_u32 v8, v14, v8 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v14, v6, vcc ; GISEL-NEXT: v_mul_lo_u32 v6, 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v7 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v5 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v2 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v13, vcc -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, 0, v7 +; GISEL-NEXT: v_mul_lo_u32 v8, v13, v7 +; GISEL-NEXT: v_mul_hi_u32 v12, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v2 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v15, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 ; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v5, v6 +; 
GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v12, 0 +; GISEL-NEXT: v_mul_hi_u32 v14, 0, v7 ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GISEL-NEXT: v_mul_hi_u32 v10, 0, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v9, vcc ; GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GISEL-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v10, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v3, v14, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v11, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 +; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v12, v[6:7] ; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], 0, v9, v[6:7] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v5 -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v6 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v6, vcc +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v5 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v8 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v8, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3 ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 @@ -2736,8 +2736,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5] -; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v9 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v10, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v12 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v14, vcc ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 @@ -2748,8 +2748,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 -; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_sdiv_v2i64_24bit: @@ -2769,27 +2769,27 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v5, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v1 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 +; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v0 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v3 ; CGP-NEXT: v_cndmask_b32_e32 v7, v1, v7, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_and_b32_e32 v8, 0xffffff, v2 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v5, v3 -; CGP-NEXT: v_add_i32_e64 v1, s[4:5], v6, v1 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v8, v1, 0 -; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc +; CGP-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v2 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v8, v3 +; CGP-NEXT: v_add_i32_e64 v6, s[4:5], v6, v1 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v6, 0 +; CGP-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v7 -; CGP-NEXT: v_mul_lo_u32 v5, v2, v4 +; CGP-NEXT: v_mul_lo_u32 v6, v2, v4 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3 ; CGP-NEXT: v_cndmask_b32_e32 v0, v7, v1, vcc ; CGP-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v5 +; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v6 ; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll index 1441591a5fcce..f4489c2239fda 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -175,65 +175,65 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v6, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s14, 
v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s15, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s14, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s14, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s15, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -246,36 +246,36 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s11 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s10, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 -; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s11, v1 +; GFX8-NEXT: 
v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_subb_u32_e64 v2, s[0:1], v8, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s11, v4 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v2 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s8, v0 -; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v1, vcc -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v0 +; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v3, vcc +; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v5 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s8, v7 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 ; GFX8-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc @@ -283,20 +283,20 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v0, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v7, v10, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v0, v3, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[12:13] ; GFX8-NEXT: v_xor_b32_e32 v0, s0, v4 -; GFX8-NEXT: v_xor_b32_e32 v1, s1, v3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_xor_b32_e32 v1, s1, v6 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc -; GFX8-NEXT: v_xor_b32_e32 v3, s2, v5 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 ; GFX8-NEXT: v_xor_b32_e32 v4, s2, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, s2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s2, v3 @@ -312,6 +312,7 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-LABEL: sdivrem_i64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_ashr_i32 s2, s17, 31 ; GFX9-NEXT: s_ashr_i32 s4, s19, 31 @@ -335,64 +336,63 @@ define amdgpu_kernel void 
@sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v3, 0 -; GFX9-NEXT: v_mov_b32_e32 v7, s7 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX9-NEXT: 
v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s9, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s9, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -400,67 +400,67 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_mul_lo_u32 v4, s9, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 ; GFX9-NEXT: v_mul_hi_u32 v3, s8, v1 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s6, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s6, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s6, v8, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s8, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s7, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_subb_co_u32_e64 v2, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v2 -; GFX9-NEXT: v_sub_u32_e32 v1, s9, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s7, v6, v[2:3] +; GFX9-NEXT: v_subb_co_u32_e64 v1, s[0:1], v1, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s9, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v8, vcc, s6, v0 -; GFX9-NEXT: v_subbrev_co_u32_e64 v9, s[0:1], 0, v1, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 
0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s6, v0 +; GFX9-NEXT: v_subbrev_co_u32_e64 v5, s[0:1], 0, v2, vcc +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v8, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s7, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v7, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s6, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v8 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v5 +; GFX9-NEXT: v_subrev_co_u32_e32 v7, vcc, s6, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v1, s[0:1] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v0, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], s[4:5] -; GFX9-NEXT: v_xor_b32_e32 v0, s0, v5 -; GFX9-NEXT: v_xor_b32_e32 v1, s1, v3 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v3 +; GFX9-NEXT: v_xor_b32_e32 v1, s1, v6 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc -; GFX9-NEXT: v_xor_b32_e32 v3, s2, v6 -; GFX9-NEXT: v_xor_b32_e32 v5, s2, v2 -; GFX9-NEXT: v_mov_b32_e32 v6, s2 +; GFX9-NEXT: v_xor_b32_e32 v3, s2, v4 +; GFX9-NEXT: v_xor_b32_e32 v4, s2, v2 +; GFX9-NEXT: v_mov_b32_e32 v5, s2 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v6, vcc -; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[12:13] -; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[14:15] +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v5, vcc +; GFX9-NEXT: global_store_dwordx2 v9, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v9, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: sdivrem_i64: @@ -1311,68 +1311,68 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX8-NEXT: 
v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] ; GFX8-NEXT: s_ashr_i32 s6, s19, 31 ; GFX8-NEXT: s_mov_b32 s7, s6 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: 
v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -1385,38 +1385,38 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, s11, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v6, s11 -; GFX8-NEXT: v_sub_u32_e32 v7, vcc, s10, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, s10, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, s9 ; GFX8-NEXT: s_ashr_i32 s10, s3, 31 -; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_subb_u32_e64 v8, s[0:1], v8, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s11, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX8-NEXT: v_subrev_u32_e32 v8, vcc, s8, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[0:1] -; GFX8-NEXT: v_subbrev_u32_e64 v9, s[0:1], 0, v0, vcc -; GFX8-NEXT: v_add_u32_e64 v1, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s8, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1] +; GFX8-NEXT: v_subbrev_u32_e64 v10, s[0:1], 0, v0, vcc +; GFX8-NEXT: v_add_u32_e64 v3, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v5, s[0:1], 0, v7, s[0:1] +; GFX8-NEXT: 
v_cmp_le_u32_e64 s[0:1], s9, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v9 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v1 -; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v3 +; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v5, s[0:1] ; GFX8-NEXT: s_add_u32 s0, s18, s6 ; GFX8-NEXT: s_addc_u32 s1, s19, s6 ; GFX8-NEXT: s_add_u32 s2, s2, s10 @@ -1424,15 +1424,15 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: s_addc_u32 s3, s3, s10 ; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] ; GFX8-NEXT: v_cvt_f32_u32_e32 v14, s3 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_cvt_f32_u32_e32 v5, s2 -; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v8 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s2 +; GFX8-NEXT: v_subrev_u32_e32 v15, vcc, s8, v4 ; GFX8-NEXT: v_subbrev_u32_e32 v16, vcc, 0, v0, vcc ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v14 -; GFX8-NEXT: v_add_f32_e32 v0, v0, v5 +; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v1, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc ; GFX8-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -1441,151 +1441,151 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v0 ; GFX8-NEXT: s_sub_u32 s5, 0, s2 -; GFX8-NEXT: s_subb_u32 s20, 0, s3 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX8-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[0:1] -; GFX8-NEXT: v_cvt_u32_f32_e32 v5, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v10, v3, v10, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s5, v5, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v12, v[1:2] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v2, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v8, v12, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v11, v[1:2] +; GFX8-NEXT: s_subb_u32 s20, 0, s3 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v15, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v12, v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[0:1] +; GFX8-NEXT: v_mul_lo_u32 v1, v11, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v12, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v2, s[0:1] ; GFX8-NEXT: v_mul_hi_u32 v2, v12, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v5, v0 -; GFX8-NEXT: v_xor_b32_e32 v9, s17, v10 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, 
vcc -; GFX8-NEXT: v_mul_lo_u32 v3, v5, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v11, v0 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v8 -; GFX8-NEXT: v_mul_hi_u32 v1, v5, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v12, v0 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, s16, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v2, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_mul_hi_u32 v3, v12, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v2, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, v11, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v12, v0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v10, 0 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v11, v1, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, s16, v6 ; GFX8-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1] -; GFX8-NEXT: v_mov_b32_e32 v10, s17 +; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1] +; GFX8-NEXT: v_xor_b32_e32 v5, s17, v7 +; GFX8-NEXT: v_mov_b32_e32 v6, s17 ; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s16, v1 -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4] -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v9, v10, vcc -; GFX8-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX8-NEXT: v_mul_lo_u32 v7, v5, v2 -; GFX8-NEXT: v_mul_lo_u32 v9, v8, v3 -; GFX8-NEXT: v_mul_hi_u32 v11, v8, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v5, v2 -; GFX8-NEXT: v_xor_b32_e32 v6, s4, v6 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v11 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v5, v6, vcc +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4] +; GFX8-NEXT: v_mul_lo_u32 v4, v11, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, s4, v9 +; GFX8-NEXT: v_mul_lo_u32 v7, v10, v5 +; GFX8-NEXT: v_mul_hi_u32 v9, v10, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, v11, v2 +; GFX8-NEXT: v_xor_b32_e32 v6, s4, v8 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 ; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v11, v5, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_mul_hi_u32 v9, v8, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v11, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v9 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, v11, v5 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v7, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v9, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v11, v9 -; GFX8-NEXT: v_mul_hi_u32 v3, v5, v3 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v7 ; 
GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v7, vcc, v9, v7 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v7 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v8, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; GFX8-NEXT: v_mov_b32_e32 v10, s4 -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v2 -; GFX8-NEXT: v_mul_lo_u32 v8, s8, v3 -; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v4 -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v10, vcc -; GFX8-NEXT: v_mul_hi_u32 v6, s8, v2 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_mul_hi_u32 v5, v11, v5 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v11, v4, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, s9, v2 +; GFX8-NEXT: v_mul_lo_u32 v10, s8, v7 +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s4, v3 +; GFX8-NEXT: v_mul_hi_u32 v3, s8, v2 +; GFX8-NEXT: v_mov_b32_e32 v8, s4 +; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v8, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v9, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s9, v3 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v6, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, s9, v7 ; GFX8-NEXT: v_mul_hi_u32 v2, s9, v2 -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_mul_hi_u32 v8, s8, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v7, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, v8, v3 +; GFX8-NEXT: v_mul_hi_u32 v8, s8, v7 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v2, v6 -; GFX8-NEXT: v_mul_hi_u32 v9, s9, v3 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v8, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 -; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v9, v[3:4] -; GFX8-NEXT: v_mov_b32_e32 v10, s9 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v2, v3 +; GFX8-NEXT: v_mul_hi_u32 v7, s9, v7 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 +; GFX8-NEXT: v_add_u32_e32 v11, vcc, v7, v6 +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v12, s9 ; GFX8-NEXT: v_sub_u32_e32 v2, vcc, s8, v2 -; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v8, v[6:7] +; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v10, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v6, s[0:1], s9, v6 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[0:1] +; GFX8-NEXT: v_subb_u32_e64 v6, s[0:1], v12, v8, vcc +; GFX8-NEXT: v_sub_u32_e64 v7, s[0:1], s9, v8 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[0:1] -; 
GFX8-NEXT: v_subrev_u32_e32 v11, vcc, s2, v2 -; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v9, vcc, s2, v2 +; GFX8-NEXT: v_subbrev_u32_e64 v12, s[0:1], 0, v7, vcc ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 ; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] -; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v8 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc -; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v9, s[0:1] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 1, v14 +; GFX8-NEXT: v_add_u32_e64 v14, s[0:1], 1, v10 +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GFX8-NEXT: v_addc_u32_e64 v15, s[0:1], 0, v11, s[0:1] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, 1, v14 ; GFX8-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v11 +; GFX8-NEXT: v_subrev_u32_e64 v13, s[0:1], s2, v9 ; GFX8-NEXT: v_subbrev_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v9, v14, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] ; GFX8-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX8-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX8-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX8-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX8-NEXT: v_mov_b32_e32 v6, s1 +; GFX8-NEXT: v_mov_b32_e32 v7, s1 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc -; GFX8-NEXT: v_xor_b32_e32 v6, s6, v9 -; GFX8-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX8-NEXT: v_mov_b32_e32 v8, s6 -; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v6 -; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v8, vcc +; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc +; GFX8-NEXT: v_xor_b32_e32 v7, s6, v9 +; GFX8-NEXT: v_xor_b32_e32 v8, s6, v6 +; GFX8-NEXT: v_mov_b32_e32 v9, s6 +; GFX8-NEXT: v_subrev_u32_e32 v6, vcc, s6, v7 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v8, v9, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, s12 ; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -1622,66 +1622,67 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, 
v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s16, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s17, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s16, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s16, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s17, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX9-NEXT: s_xor_b64 s[16:17], s[4:5], s[6:7] ; GFX9-NEXT: s_ashr_i32 s6, s19, 31 ; GFX9-NEXT: s_mov_b32 s7, s6 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: 
v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s11, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s10, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s10, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s11, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s11, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s11, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -1693,51 +1694,50 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v3, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s10, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v5, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v7, v[1:2] +; GFX9-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-NEXT: s_ashr_i32 s10, s3, 31 -; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v6, v1, vcc -; GFX9-NEXT: v_sub_u32_e32 v0, s11, v1 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v8, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v0, s11, v4 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v6 -; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v1, v2, s[0:1] -; GFX9-NEXT: v_subbrev_co_u32_e64 v10, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v8 +; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s8, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v2, v3, s[0:1] +; GFX9-NEXT: v_subbrev_co_u32_e64 v11, s[0:1], 0, v0, vcc +; GFX9-NEXT: v_add_co_u32_e64 v2, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v3, s[0:1], 0, v7, s[0:1] +; GFX9-NEXT: 
v_cmp_le_u32_e64 s[0:1], s9, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s8, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[0:1] -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v2 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v12, s[0:1], 1, v2 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[0:1], 0, v3, s[0:1] ; GFX9-NEXT: s_add_u32 s0, s18, s6 ; GFX9-NEXT: s_addc_u32 s1, s19, s6 ; GFX9-NEXT: s_add_u32 s2, s2, s10 ; GFX9-NEXT: s_mov_b32 s11, s10 ; GFX9-NEXT: s_addc_u32 s3, s3, s10 ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], s[10:11] -; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 +; GFX9-NEXT: v_cvt_f32_u32_e32 v14, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v15, s2 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v1 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f800000, v14 ; GFX9-NEXT: v_add_f32_e32 v1, v1, v15 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s8, v9 +; GFX9-NEXT: v_subrev_co_u32_e32 v14, vcc, s8, v10 ; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc ; GFX9-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 @@ -1747,31 +1747,31 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v0 ; GFX9-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7] ; GFX9-NEXT: s_sub_u32 s5, 0, s2 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v17, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v2, v13, vcc -; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v2, v12, vcc +; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v13, v3, v13, vcc ; GFX9-NEXT: s_subb_u32 s20, 0, s3 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v13, v[1:2] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, v11, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[18:19], s20, v17, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v13, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v15, vcc -; GFX9-NEXT: v_mul_lo_u32 v3, v17, v1 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v12, v[1:2] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, v5, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[18:19], s20, v17, v[2:3] +; GFX9-NEXT: v_mul_lo_u32 v2, v12, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v14, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v17, v4 ; GFX9-NEXT: v_mul_hi_u32 v10, v17, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v13, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v13, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v15, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v10 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v10, v13, v1 +; GFX9-NEXT: v_mul_lo_u32 v10, v12, v4 +; GFX9-NEXT: v_mul_hi_u32 v0, v12, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mul_hi_u32 v3, v17, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v13, v1 +; GFX9-NEXT: 
v_mul_hi_u32 v3, v17, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v12, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v10, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 @@ -1779,119 +1779,119 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 ; GFX9-NEXT: v_add_u32_e32 v3, v10, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v2, v3, v2, v4 ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v17, v0 -; GFX9-NEXT: v_add3_u32 v1, v3, v2, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v12, v2, vcc ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[18:19], s5, v10, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v13, v1, vcc +; GFX9-NEXT: v_xor_b32_e32 v1, s16, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v6, s[0:1] ; GFX9-NEXT: v_mov_b32_e32 v0, v3 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v4, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s5, v11, v[0:1] -; GFX9-NEXT: v_xor_b32_e32 v5, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v8, s17, v8 -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s20, v10, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v9, s17 -; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v5 -; GFX9-NEXT: v_xor_b32_e32 v4, s4, v7 -; GFX9-NEXT: v_mul_lo_u32 v5, v11, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, v10, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s5, v11, v[0:1] +; GFX9-NEXT: v_xor_b32_e32 v8, s17, v5 +; GFX9-NEXT: v_mov_b32_e32 v12, s17 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s20, v10, v[3:4] +; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s16, v1 +; GFX9-NEXT: v_mul_lo_u32 v3, v11, v2 +; GFX9-NEXT: v_mul_lo_u32 v6, v10, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v8, v12, vcc ; GFX9-NEXT: v_mul_hi_u32 v8, v10, v2 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v8 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v8, v11, v3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v8, v11, v5 ; GFX9-NEXT: v_mul_hi_u32 v2, v11, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, v10, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v11, v3 +; GFX9-NEXT: v_add_u32_e32 v3, v6, v3 +; GFX9-NEXT: v_mul_hi_u32 v6, v10, v5 +; GFX9-NEXT: v_mul_hi_u32 v5, v11, v5 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_add_u32_e32 v7, v8, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v3, v7, v5, v3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 +; GFX9-NEXT: v_add_u32_e32 v6, v8, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v3, v6, v3, v5 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v10, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v11, v3, vcc ; GFX9-NEXT: v_mul_lo_u32 v5, s9, v2 -; GFX9-NEXT: v_mul_lo_u32 v7, s8, v3 +; GFX9-NEXT: v_mul_lo_u32 v6, s8, v3 +; GFX9-NEXT: v_xor_b32_e32 v4, s4, v9 ; GFX9-NEXT: v_mul_hi_u32 v9, s8, v2 ; GFX9-NEXT: v_mul_hi_u32 v2, s9, 
v2 -; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v5, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX9-NEXT: v_mul_lo_u32 v9, s9, v3 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_mul_hi_u32 v7, s8, v3 -; GFX9-NEXT: v_xor_b32_e32 v6, s4, v6 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_mul_hi_u32 v6, s8, v3 +; GFX9-NEXT: v_mul_hi_u32 v12, s9, v3 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v9, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v2, v5 ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v10, 0 -; GFX9-NEXT: v_mov_b32_e32 v8, s4 ; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 +; GFX9-NEXT: v_xor_b32_e32 v7, s4, v7 +; GFX9-NEXT: v_mov_b32_e32 v8, s4 ; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v8, vcc -; GFX9-NEXT: v_add_u32_e32 v6, v9, v7 -; GFX9-NEXT: v_add3_u32 v8, v6, v11, v12 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v8, v[3:4] -; GFX9-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-NEXT: v_add3_u32 v11, v6, v11, v12 +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v7, v8, vcc +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s2, v11, v[3:4] +; GFX9-NEXT: v_mov_b32_e32 v12, s9 ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s8, v2 -; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v10, v[6:7] +; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[0:1], s3, v10, v[6:7] ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_subb_co_u32_e64 v7, s[0:1], v9, v6, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v7 -; GFX9-NEXT: v_sub_u32_e32 v6, s9, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX9-NEXT: v_subb_co_u32_e64 v6, s[0:1], v12, v8, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v6 +; GFX9-NEXT: v_sub_u32_e32 v7, s9, v8 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v11, vcc, s2, v2 -; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v6 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v9, vcc, s2, v2 +; GFX9-NEXT: v_subbrev_co_u32_e64 v12, s[0:1], 0, v7, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v11 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s2, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s3, v12 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v14, s[0:1], 1, v10 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v6, v3, vcc -; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 1, v14 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v7, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v15, s[0:1], 0, v11, s[0:1] +; GFX9-NEXT: 
v_add_co_u32_e32 v7, vcc, 1, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v15, vcc ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v14, v7, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc -; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v11 +; GFX9-NEXT: v_subrev_co_u32_e64 v15, s[0:1], s2, v9 ; GFX9-NEXT: v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1] -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v15, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v8 +; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v15, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v14, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v7, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v11, v14, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, v9, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v3, s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], s[6:7], s[10:11] -; GFX9-NEXT: v_xor_b32_e32 v2, s0, v6 +; GFX9-NEXT: v_xor_b32_e32 v2, s0, v7 ; GFX9-NEXT: v_xor_b32_e32 v3, s1, v8 -; GFX9-NEXT: v_mov_b32_e32 v6, s1 +; GFX9-NEXT: v_mov_b32_e32 v7, s1 ; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc -; GFX9-NEXT: v_xor_b32_e32 v6, s6, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc +; GFX9-NEXT: v_xor_b32_e32 v7, s6, v9 ; GFX9-NEXT: v_mov_b32_e32 v13, 0 -; GFX9-NEXT: v_xor_b32_e32 v7, s6, v7 -; GFX9-NEXT: v_mov_b32_e32 v8, s6 -; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v6 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v8, vcc +; GFX9-NEXT: v_xor_b32_e32 v8, s6, v6 +; GFX9-NEXT: v_mov_b32_e32 v9, s6 +; GFX9-NEXT: v_subrev_co_u32_e32 v6, vcc, s6, v7 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v9, vcc ; GFX9-NEXT: global_store_dwordx4 v13, v[0:3], s[12:13] ; GFX9-NEXT: global_store_dwordx4 v13, v[4:7], s[14:15] ; GFX9-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll index 40b5db0a15447..6f42239cd191d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll @@ -31,128 +31,128 @@ define i64 @v_srem_i64(i64 %num, i64 %den) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v6, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v6 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v6 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 +; CHECK-NEXT: 
v_cvt_u32_f32_e32 v13, v6 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v12, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v9, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v5 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v5, v3, v9 -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v2 -; CHECK-NEXT: v_mul_lo_u32 v7, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v10, v4, v9 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v11, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v10, 0 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v5 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v11 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v5, v11, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v3, v11 +; CHECK-NEXT: v_mul_lo_u32 v3, v13, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v10, v8 +; CHECK-NEXT: v_xor_b32_e32 v12, v4, v11 +; CHECK-NEXT: v_mul_hi_u32 v4, v10, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v13, v2 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v4, v13, v8 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; 
CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v6, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v6, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v7, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v10, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_mul_hi_u32 v6, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v2, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v7, 0 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v6, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v7, v[3:4] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CHECK-NEXT: v_subbrev_u32_e64 v7, 
s[4:5], 0, v4, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc ; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; implicit-def: $vgpr4 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -214,65 +214,65 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; CHECK-NEXT: v_trunc_f32_e32 v2, v1 ; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v0 -; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v6, v3, v1 -; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1 -; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v2 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v3, v6, v4 +; CHECK-NEXT: v_mul_lo_u32 v5, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v8, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 +; CHECK-NEXT: 
v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v6, 0 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s3, v7, v[1:2] +; CHECK-NEXT: v_mul_lo_u32 v1, v7, v0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v6, v[2:3] +; CHECK-NEXT: v_mul_hi_u32 v3, v6, v0 +; CHECK-NEXT: v_mul_hi_u32 v0, v7, v0 +; CHECK-NEXT: v_mul_lo_u32 v2, v6, v4 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s3, v3, 0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v4, v[1:2] -; CHECK-NEXT: v_mul_hi_u32 v6, v3, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s5, v3, v[1:2] -; CHECK-NEXT: v_mul_lo_u32 v2, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0 -; CHECK-NEXT: v_mul_lo_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v4, v1 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v5, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_mul_lo_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_mul_hi_u32 v2, v6, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; CHECK-NEXT: v_mul_hi_u32 v3, v7, v4 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v6, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; CHECK-NEXT: v_mul_lo_u32 v2, s11, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, s10, v1 ; CHECK-NEXT: v_mul_hi_u32 v4, s10, v0 ; CHECK-NEXT: v_mul_hi_u32 v0, s11, v0 -; CHECK-NEXT: v_mul_hi_u32 v5, s11, v1 +; CHECK-NEXT: v_mov_b32_e32 v7, s11 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 @@ -285,19 +285,19 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v4, 0 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v4, s11, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s8, v6, 0 ; 
CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s8, v2, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v5, s11 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s8, v4, v[1:2] +; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, s10, v0 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s9, v4, v[1:2] -; CHECK-NEXT: v_mov_b32_e32 v3, s9 -; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v5, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[0:1], s11, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s9, v6, v[2:3] +; CHECK-NEXT: v_sub_i32_e64 v3, s[0:1], s11, v4 +; CHECK-NEXT: v_subb_u32_e64 v2, s[0:1], v7, v4, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; CHECK-NEXT: v_cmp_le_u32_e64 s[0:1], s9, v2 ; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, s8, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] @@ -372,84 +372,84 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_xor_b32_e32 v8, v9, v8 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v14, vcc, 0, v8, vcc +; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v8, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v4 ; GISEL-NEXT: v_trunc_f32_e32 v11, v9 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v11 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v11 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v15, v9 -; GISEL-NEXT: v_mul_hi_u32 v16, v12, v9 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v17, v9 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11] +; GISEL-NEXT: v_mul_hi_u32 v11, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v16 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v16, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; 
GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GISEL-NEXT: v_mul_hi_u32 v11, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v4 -; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v15, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v12, 0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v4 +; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v9, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v14, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v10 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v15, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v17, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[10:11] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_xor_b32_e32 v13, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v15, v9 -; GISEL-NEXT: v_mul_lo_u32 v11, v12, v10 -; GISEL-NEXT: v_xor_b32_e32 v14, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v12, v9 -; GISEL-NEXT: v_mul_hi_u32 v9, v15, v9 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v0, v17, v9 +; GISEL-NEXT: v_mul_lo_u32 v10, v14, v12 +; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v14, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v17, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v15, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v11, v12, v10 +; GISEL-NEXT: v_mul_lo_u32 v1, v17, v12 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v10, v14, v12 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 -; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v17, v12 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v15, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v0 ; GISEL-NEXT: v_mul_lo_u32 v10, v13, v1 ; GISEL-NEXT: v_mul_hi_u32 v11, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v11, v14, v1 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v1 ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_mul_hi_u32 v10, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 @@ -457,148 +457,148 @@ define <2 x i64> 
@v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_mul_hi_u32 v1, v14, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v9 +; GISEL-NEXT: v_mul_hi_u32 v1, v15, v1 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v0, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v1, v0 -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v7 -; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v6, v6, v10 -; GISEL-NEXT: v_xor_b32_e32 v7, v7, v10 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v1, v0 +; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v7 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc +; GISEL-NEXT: v_xor_b32_e32 v6, v6, v9 +; GISEL-NEXT: v_xor_b32_e32 v7, v7, v9 ; GISEL-NEXT: v_cvt_f32_u32_e32 v12, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v7 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0 -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v15 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v9, v[1:2] +; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v7 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 +; GISEL-NEXT: v_mac_f32_e32 v12, 0x4f800000, v16 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v11, v[1:2] ; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v12 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v8, v11, v[9:10] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v8, v14, v[9:10] ; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1 -; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 -; GISEL-NEXT: v_trunc_f32_e32 v12, v10 +; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v1 +; GISEL-NEXT: v_trunc_f32_e32 v12, v9 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, v13, v0 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 -; GISEL-NEXT: v_mov_b32_e32 v0, v11 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v12, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v14, v9, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v15, v[0:1] -; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v14, v9 -; GISEL-NEXT: v_mul_lo_u32 v9, v12, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v15, v0 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v11, v8 +; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v14, 0 +; GISEL-NEXT: v_sub_i32_e32 v19, vcc, v13, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v18, v[0:1] +; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v15, v11, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v14, v[12:13] +; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v15, v11 +; GISEL-NEXT: v_mul_lo_u32 v11, v18, v9 +; GISEL-NEXT: v_mul_lo_u32 v12, v14, v0 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v10, v8 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: v_mul_hi_u32 v14, v15, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v10, v12, v10 -; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v14 -; GISEL-NEXT: 
v_cndmask_b32_e64 v9, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v13, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v11, v8 -; GISEL-NEXT: v_cndmask_b32_e64 v9, v9, v14, s[6:7] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v13, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v1, vcc -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v14, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v8 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v14, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v9, v18, v9 +; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v12 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v19, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v10, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[6:7] +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v19, v5 +; GISEL-NEXT: v_subbrev_u32_e64 v15, s[6:7], 0, v1, vcc +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v12, v5 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v15, v8 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v8 -; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v14, v5 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v15, v8 +; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v12, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v14, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v8, v19, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v8, v15, v1, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v14, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v18, v15, v0 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v18 -; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v18 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v1 -; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v10, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v12, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc +; GISEL-NEXT: v_mul_lo_u32 v12, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 +; GISEL-NEXT: v_mul_hi_u32 v13, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 +; GISEL-NEXT: v_mul_hi_u32 v0, v18, v0 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v14, v1 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v18, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], 
v16, v12, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v10, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v16, v13, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v10, v[8:9] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v12, v[8:9] ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v13, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v12, v0 -; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_xor_b32_e32 v11, v2, v5 +; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v12, v10 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v3, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v12, v8 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v3, v13, v10 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 -; GISEL-NEXT: v_mul_hi_u32 v8, v12, v8 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v12, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 -; GISEL-NEXT: v_mul_lo_u32 v8, v13, v2 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 -; GISEL-NEXT: v_xor_b32_e32 v10, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 +; GISEL-NEXT: v_mul_lo_u32 v8, v11, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 +; GISEL-NEXT: v_xor_b32_e32 v10, v14, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v9, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v8, v3 -; GISEL-NEXT: v_mul_hi_u32 v8, v13, v2 +; GISEL-NEXT: v_mul_hi_u32 v8, v11, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v9, v14, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v3 +; 
GISEL-NEXT: v_mul_hi_u32 v9, v15, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v12, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v9, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v8, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v10, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v11, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v12, v[8:9] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 @@ -651,128 +651,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v1, v2, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v15, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v16, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CGP-NEXT: v_trunc_f32_e32 v4, v3 ; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CGP-NEXT: v_cvt_u32_f32_e32 v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v16, v5, v3 -; CGP-NEXT: v_mul_lo_u32 v17, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v16 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v3 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v16, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v17, v2 -; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v15 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v14, v2 +; CGP-NEXT: v_cvt_u32_f32_e32 v17, v4 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4] +; CGP-NEXT: v_mul_lo_u32 v3, v17, v2 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_mul_hi_u32 v4, v14, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v14, v12 +; CGP-NEXT: v_mul_lo_u32 v13, v17, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CGP-NEXT: v_addc_u32_e32 v14, vcc, v14, v3, vcc -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 -; 
CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v12, v14, v[3:4] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v11 -; CGP-NEXT: v_mul_hi_u32 v15, v5, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v13, v5, v[3:4] -; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v12 -; CGP-NEXT: v_addc_u32_e32 v10, vcc, v11, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v4, v12 -; CGP-NEXT: v_mul_lo_u32 v4, v14, v2 -; CGP-NEXT: v_mul_lo_u32 v13, v5, v3 -; CGP-NEXT: v_mul_hi_u32 v2, v14, v2 -; CGP-NEXT: v_xor_b32_e32 v10, v10, v12 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15 -; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v14, v3 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v5, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v15, v2 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mul_hi_u32 v3, v14, v3 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v12 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 +; CGP-NEXT: v_mul_hi_u32 v5, v17, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v2 +; CGP-NEXT: v_addc_u32_e32 v17, vcc, v17, v3, vcc +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v15, v14, 0 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v17, v[3:4] +; CGP-NEXT: v_ashrrev_i32_e32 v15, 31, v11 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v15 +; CGP-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[4:5] +; CGP-NEXT: v_addc_u32_e32 v4, vcc, v11, v15, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v3, v15 +; CGP-NEXT: v_mul_lo_u32 v3, v17, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v14, v12 +; CGP-NEXT: v_xor_b32_e32 v16, v4, v15 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v2 +; CGP-NEXT: v_mul_hi_u32 v2, v17, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v14, v3, vcc -; CGP-NEXT: v_mul_lo_u32 v4, v10, v2 -; CGP-NEXT: v_mul_lo_u32 v5, v11, v3 -; CGP-NEXT: v_mul_hi_u32 v13, v11, v2 -; CGP-NEXT: v_mul_hi_u32 v2, v10, v2 -; CGP-NEXT: v_mul_hi_u32 v14, v10, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v17, v12 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_mul_hi_u32 v5, v14, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v17, v12 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v14, v2 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v17, v3, vcc +; CGP-NEXT: v_mul_lo_u32 v4, v16, v2 +; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 +; CGP-NEXT: v_mul_hi_u32 v10, v13, v2 +; CGP-NEXT: v_mul_hi_u32 v2, 
v16, v2 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v10, v3 +; CGP-NEXT: v_mul_lo_u32 v10, v16, v3 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v13, v2 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v5, v13, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v2, v4 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v13, 0 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v2, v4 +; CGP-NEXT: v_mul_hi_u32 v10, v16, v3 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v12, 0 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v11, v2 -; CGP-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v13, v[3:4] -; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v10, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v13, v2 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v1, v12, v[4:5] +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v16, v10, vcc +; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v16, v10 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v10, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v10, vcc, v2, v0 -; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v3, vcc +; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v4, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v0 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v1 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; CGP-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CGP-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v12 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v12 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc +; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v15 +; CGP-NEXT: 
v_xor_b32_e32 v1, v1, v15 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v15 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v15, vcc ; CGP-NEXT: ; implicit-def: $vgpr4 ; CGP-NEXT: ; implicit-def: $vgpr10 ; CGP-NEXT: .LBB2_2: ; %Flow1 @@ -820,128 +820,128 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3 -; CGP-NEXT: v_sub_i32_e32 v10, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v13, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v14, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v5 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v12, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v14, v7, v5 -; CGP-NEXT: v_mul_lo_u32 v15, v12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v5 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v13, v6 +; CGP-NEXT: v_cvt_u32_f32_e32 v12, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v6 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6] +; CGP-NEXT: v_mul_lo_u32 v5, v15, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_mul_hi_u32 v6, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v12, v10 +; CGP-NEXT: v_mul_lo_u32 v11, v15, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v7, 0 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v12, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v9 -; CGP-NEXT: v_mul_hi_u32 v13, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v7, v[5:6] -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v10 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v9, v6, v10 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 -; CGP-NEXT: v_mul_lo_u32 v11, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 -; CGP-NEXT: v_xor_b32_e32 v8, v8, v10 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13 -; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v12, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 -; CGP-NEXT: v_mul_hi_u32 v11, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, 
vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11 -; CGP-NEXT: v_mul_hi_u32 v5, v12, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v10 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v15, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v4 +; CGP-NEXT: v_addc_u32_e32 v15, vcc, v15, v5, vcc +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v12, 0 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v15, v[5:6] +; CGP-NEXT: v_ashrrev_i32_e32 v13, 31, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v13 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v12, v[6:7] +; CGP-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v11, v5, v13 +; CGP-NEXT: v_mul_lo_u32 v5, v15, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v12, v10 +; CGP-NEXT: v_xor_b32_e32 v14, v6, v13 +; CGP-NEXT: v_mul_hi_u32 v6, v12, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v7, v9, v5 -; CGP-NEXT: v_mul_hi_u32 v11, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v8, v4 -; CGP-NEXT: v_mul_hi_u32 v12, v8, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v15, v10 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v7, v12, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_mul_hi_u32 v7, v15, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc +; CGP-NEXT: v_mul_lo_u32 v6, v14, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v11, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v11, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v14, v4 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v11 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v8, v14, v5 ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_mul_hi_u32 v7, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v7, v11, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 +; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v7, vcc, v11, v7 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: 
v_add_i32_e32 v10, vcc, v4, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v14, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v11, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v8, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v10, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v14, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v14, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 +; CGP-NEXT: v_subb_u32_e32 v6, vcc, v6, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 -; CGP-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v13 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v13 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v13 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v13, vcc ; CGP-NEXT: ; implicit-def: $vgpr6 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -977,82 +977,82 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000 ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xfffff000 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 
v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: 
v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000 +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x1000 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1060,39 +1060,39 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: 
v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 4096 ret i64 %result @@ -1141,92 +1141,92 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 v7, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: 
v_add_i32_e32 v13, vcc, v13, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7 +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16 +; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x1000 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; 
GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1243,74 +1243,74 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; 
GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1330,10 +1330,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, 
vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_pow2k_denom: @@ -1352,110 +1352,110 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 ; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] ; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] ; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 +; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, 
vcc, v13, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 +; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x1000 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13 +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: 
v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 @@ -1473,72 +1473,72 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 +; CGP-NEXT: v_mul_hi_u32 v3, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; 
CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1558,10 +1558,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, <i64 4096, i64 4096> ret <2 x i64> %result @@ -1573,82 +1573,82 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb ; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0 -; CHECK-NEXT: v_mov_b32_e32 v6, 0xffed2705 +; CHECK-NEXT: v_mov_b32_e32 v9, 0xffed2705 ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v4, v3 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4 -; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3 -; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v4 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_mul_lo_u32 v3, v10, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_mul_hi_u32 v4, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v8, v6 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v6 +; CHECK-NEXT: v_mul_hi_u32 v11, v8, v6 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2 -; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4] -; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4] -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc -; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6 -; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2 -; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3 -; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6 -; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CHECK-NEXT: v_mul_hi_u32 v5, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 +; CHECK-NEXT: v_addc_u32_e32 v10, vcc, v10, v3, vcc 
+; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, v[3:4] +; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v1 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], -1, v8, v[4:5] +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_xor_b32_e32 v4, v0, v9 +; CHECK-NEXT: v_mul_lo_u32 v0, v10, v2 +; CHECK-NEXT: v_mul_lo_u32 v3, v8, v6 +; CHECK-NEXT: v_xor_b32_e32 v5, v1, v9 +; CHECK-NEXT: v_mul_hi_u32 v1, v8, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3 +; CHECK-NEXT: v_mul_lo_u32 v1, v10, v6 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v3, v0 +; CHECK-NEXT: v_mul_hi_u32 v3, v8, v6 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8 -; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v6 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1 ; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc -; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0 +; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v10, v1, vcc +; CHECK-NEXT: v_mul_lo_u32 v2, v5, v0 ; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1 ; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0 -; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0 -; CHECK-NEXT: v_mov_b32_e32 v5, 0x12d8fb +; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x12d8fb ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1 +; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1 ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0 @@ -1656,39 +1656,39 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) { ; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; CHECK-NEXT: v_mul_hi_u32 v7, v9, v1 -; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 -; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 -; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2] +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; CHECK-NEXT: v_mul_hi_u32 v8, v5, v1 +; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v7 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v2 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v7, v[1:2] ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0 -; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc -; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1 -; CHECK-NEXT: v_subbrev_u32_e32 
v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v5 -; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5 -; CHECK-NEXT: v_cndmask_b32_e32 v7, -1, v7, vcc -; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5 +; CHECK-NEXT: v_subb_u32_e64 v1, s[4:5], v5, v2, vcc +; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v5, v2 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v6 +; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v5, -1, v5, vcc +; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; CHECK-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1 +; CHECK-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v2, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, v3, s[4:5] -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc ; CHECK-NEXT: s_setpc_b64 s[30:31] %result = srem i64 %num, 1235195 ret i64 %result @@ -1737,92 +1737,92 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 -; GISEL-NEXT: v_add_i32_e32 v16, vcc, v8, v4 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v8, v4 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; GISEL-NEXT: v_addc_u32_e32 v19, vcc, v9, v7, vcc ; GISEL-NEXT: v_mov_b32_e32 v4, v14 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v17, v13 -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], s6, v16, v[14:15] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v19, v13 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], s6, v18, v[14:15] ; GISEL-NEXT: s_mov_b32 s6, 1 ; GISEL-NEXT: s_cmp_lg_u32 s6, 0 -; GISEL-NEXT: v_mul_lo_u32 v7, v16, v14 +; GISEL-NEXT: v_mul_lo_u32 v7, v18, v16 ; GISEL-NEXT: s_subb_u32 s6, 0, 0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 -; GISEL-NEXT: v_mul_hi_u32 v7, v16, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v7, v18, v13 +; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_hi_u32 
v7, v17, v13 -; GISEL-NEXT: v_mul_lo_u32 v13, v17, v14 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v15, v16, v14 +; GISEL-NEXT: v_mul_hi_u32 v7, v19, v13 +; GISEL-NEXT: v_mul_lo_u32 v13, v19, v16 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v14, v18, v16 ; GISEL-NEXT: v_add_i32_e32 v7, vcc, v13, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v7, v15 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v7, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v7 ; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc -; GISEL-NEXT: v_xor_b32_e32 v18, v0, v7 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; GISEL-NEXT: v_mul_hi_u32 v4, v17, v14 -; GISEL-NEXT: v_xor_b32_e32 v19, v1, v7 +; GISEL-NEXT: v_xor_b32_e32 v17, v0, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; GISEL-NEXT: v_mul_hi_u32 v4, v19, v16 +; GISEL-NEXT: v_xor_b32_e32 v20, v1, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v19, v0 -; GISEL-NEXT: v_mul_lo_u32 v14, v18, v1 -; GISEL-NEXT: v_mul_hi_u32 v15, v18, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v19, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v20, v0 +; GISEL-NEXT: v_mul_lo_u32 v14, v17, v1 +; GISEL-NEXT: v_mul_hi_u32 v15, v17, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v20, v0 ; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v15, v19, v1 +; GISEL-NEXT: v_mul_lo_u32 v15, v20, v1 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v14, v18, v1 +; GISEL-NEXT: v_mul_hi_u32 v14, v17, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; GISEL-NEXT: v_add_i32_e32 v15, vcc, v0, v13 -; GISEL-NEXT: v_mul_hi_u32 v16, v19, v1 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v15, 0 +; GISEL-NEXT: v_add_i32_e32 v18, vcc, v0, v13 +; GISEL-NEXT: v_mul_hi_u32 v15, v20, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v18, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v16, v13 -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], 0, v15, v[13:14] -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v13 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v15, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], 0, v18, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v17, v0 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v20, v15 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], v20, v15, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 -; GISEL-NEXT: 
v_subbrev_u32_e32 v17, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 +; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v16, v4 +; GISEL-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc +; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v16, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_cndmask_b32_e32 v18, -1, v0, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_cndmask_b32_e32 v20, -1, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v17, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18 -; GISEL-NEXT: v_mul_lo_u32 v18, v8, v0 -; GISEL-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v18 +; GISEL-NEXT: v_cndmask_b32_e64 v15, -1, v1, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, v[13:14] +; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; GISEL-NEXT: v_mul_lo_u32 v13, v8, v0 +; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v19, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v5, v19, v5, vcc +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v13 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc @@ -1839,74 +1839,74 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v1 -; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v15, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2] -; GISEL-NEXT: v_cndmask_b32_e32 v10, v14, v16, vcc -; GISEL-NEXT: v_xor_b32_e32 v1, v10, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6] -; GISEL-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v12, v2, v10 -; GISEL-NEXT: v_mul_lo_u32 v2, v9, v0 -; GISEL-NEXT: v_mul_lo_u32 v6, v8, v5 -; GISEL-NEXT: v_xor_b32_e32 v13, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v9, v0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v8, v1 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v9, v0, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15 +; GISEL-NEXT: v_cndmask_b32_e32 v13, v17, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v11, v[1:2] +; GISEL-NEXT: v_cndmask_b32_e32 v12, v16, v14, vcc +; GISEL-NEXT: v_xor_b32_e32 v1, v12, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[8:9] +; GISEL-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, 
v12, vcc +; GISEL-NEXT: v_xor_b32_e32 v9, v2, v12 +; GISEL-NEXT: v_mul_lo_u32 v2, v11, v0 +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5 +; GISEL-NEXT: v_xor_b32_e32 v14, v3, v12 +; GISEL-NEXT: v_mul_hi_u32 v3, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v9, v5 +; GISEL-NEXT: v_mul_lo_u32 v3, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v6, v10, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; GISEL-NEXT: v_mul_hi_u32 v5, v9, v5 +; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v2 -; GISEL-NEXT: v_mul_hi_u32 v6, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_xor_b32_e32 v8, v11, v7 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v5, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v6, v9, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_xor_b32_e32 v8, v13, v7 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v6, v13, v2 +; GISEL-NEXT: v_mul_lo_u32 v6, v14, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v2 +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v6, v13, v2 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v9, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3 +; GISEL-NEXT: v_mul_hi_u32 v6, v14, v2 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v10, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v0 +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v11, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v7 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v7, vcc -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[5:6] +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v14, v7 +; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v14, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; 
GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -1926,10 +1926,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v10 -; GISEL-NEXT: v_xor_b32_e32 v3, v3, v10 -; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; GISEL-NEXT: v_xor_b32_e32 v2, v2, v12 +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v12 +; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] ; ; CGP-LABEL: v_srem_v2i64_oddk_denom: @@ -1948,110 +1948,110 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8 ; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, 0 ; CGP-NEXT: v_mov_b32_e32 v9, v5 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v8, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v8, v[9:10] ; CGP-NEXT: v_mul_hi_u32 v12, v8, v4 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], -1, v7, v[9:10] +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v7, v[10:11] ; CGP-NEXT: v_mul_lo_u32 v10, v8, v4 -; CGP-NEXT: v_mul_lo_u32 v4, v7, v9 -; CGP-NEXT: v_mul_lo_u32 v13, v8, v9 -; CGP-NEXT: v_mul_hi_u32 v14, v7, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v8, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v7, v4 +; CGP-NEXT: v_mul_lo_u32 v4, v7, v13 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v13 +; CGP-NEXT: v_mul_hi_u32 v14, v7, v13 +; CGP-NEXT: v_mul_hi_u32 v13, v8, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v12 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v12 ; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v13 -; CGP-NEXT: v_add_i32_e32 v16, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0 -; CGP-NEXT: v_addc_u32_e32 v17, vcc, v8, v9, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v14, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 +; CGP-NEXT: v_add_i32_e32 v18, vcc, v7, v4 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v18, 0 +; CGP-NEXT: v_addc_u32_e32 v19, vcc, v8, v9, vcc ; CGP-NEXT: v_mov_b32_e32 v4, v14 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v17, v13 -; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15] -; CGP-NEXT: v_mul_lo_u32 v9, v16, v14 +; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v19, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v19, v13 +; CGP-NEXT: v_mad_u64_u32 v[16:17], s[4:5], -1, v18, v[14:15] +; CGP-NEXT: v_mul_lo_u32 v9, v18, v16 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 -; CGP-NEXT: v_mul_hi_u32 v9, v16, v13 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v18, v13 +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: 
v_mul_hi_u32 v9, v17, v13 -; CGP-NEXT: v_mul_lo_u32 v13, v17, v14 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v15, v16, v14 +; CGP-NEXT: v_mul_hi_u32 v9, v19, v13 +; CGP-NEXT: v_mul_lo_u32 v13, v19, v16 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v14, v18, v16 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v13, v9 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v15, vcc, v9, v15 +; CGP-NEXT: v_add_i32_e32 v14, vcc, v9, v14 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v9 ; CGP-NEXT: v_ashrrev_i32_e32 v9, 31, v1 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9 ; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v9, vcc -; CGP-NEXT: v_xor_b32_e32 v18, v0, v9 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v17, v14 -; CGP-NEXT: v_xor_b32_e32 v19, v1, v9 +; CGP-NEXT: v_xor_b32_e32 v15, v0, v9 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v19, v16 +; CGP-NEXT: v_xor_b32_e32 v17, v1, v9 ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1 ; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 -; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc -; CGP-NEXT: v_mul_lo_u32 v13, v19, v0 -; CGP-NEXT: v_mul_lo_u32 v14, v18, v1 -; CGP-NEXT: v_mul_hi_u32 v15, v18, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v19, v0 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v18, v0 +; CGP-NEXT: v_addc_u32_e32 v1, vcc, v19, v1, vcc +; CGP-NEXT: v_mul_lo_u32 v13, v17, v0 +; CGP-NEXT: v_mul_lo_u32 v14, v15, v1 +; CGP-NEXT: v_mul_hi_u32 v16, v15, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v17, v0 ; CGP-NEXT: v_mov_b32_e32 v4, 0x12d8fb ; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v15, v19, v1 +; CGP-NEXT: v_mul_lo_u32 v16, v17, v1 ; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_mul_hi_u32 v14, v18, v1 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v14, v15, v1 +; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14 ; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13 -; CGP-NEXT: v_mul_hi_u32 v15, v19, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13 -; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2] -; CGP-NEXT: v_sub_i32_e32 v14, vcc, v18, v0 -; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v19, v13 -; CGP-NEXT: v_subb_u32_e64 v15, s[4:5], v19, v13, vcc +; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v0, v13 +; CGP-NEXT: v_mul_hi_u32 v18, v17, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v13, 0 +; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v16 +; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v13 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v16, v[1:2] +; CGP-NEXT: v_sub_i32_e32 v15, vcc, v15, v0 +; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v17, v13 +; CGP-NEXT: v_subb_u32_e64 v16, s[4:5], v17, v13, vcc ; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v4 -; 
CGP-NEXT: v_sub_i32_e32 v16, vcc, v14, v4 +; CGP-NEXT: v_sub_i32_e32 v18, vcc, v15, v4 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v15, v4 +; CGP-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v0, vcc ; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; CGP-NEXT: v_subbrev_u32_e32 v17, vcc, 0, v0, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16 +; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v18, v4 +; CGP-NEXT: v_cndmask_b32_e64 v17, -1, v1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc ; CGP-NEXT: v_mov_b32_e32 v0, v5 -; CGP-NEXT: v_cndmask_b32_e64 v13, -1, v1, s[4:5] -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, v[0:1] -; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v16, v4 -; CGP-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[0:1] -; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v17 -; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v18, vcc -; CGP-NEXT: v_sub_i32_e32 v1, vcc, v16, v4 -; CGP-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v8, v[0:1] +; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v19 +; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v1, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v7, v[13:14] +; CGP-NEXT: v_sub_i32_e32 v1, vcc, v18, v4 +; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v19, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CGP-NEXT: v_mul_lo_u32 v5, v7, v0 -; CGP-NEXT: v_cndmask_b32_e32 v16, v16, v1, vcc -; CGP-NEXT: v_cndmask_b32_e32 v17, v17, v18, vcc +; CGP-NEXT: v_cndmask_b32_e32 v14, v18, v1, vcc +; CGP-NEXT: v_cndmask_b32_e32 v13, v19, v13, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v5 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11 @@ -2069,72 +2069,72 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5 -; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v1 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v8, v0, vcc -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v7, 0 -; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 -; CGP-NEXT: v_cndmask_b32_e32 v5, v14, v16, vcc -; CGP-NEXT: v_xor_b32_e32 v11, v5, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v8, v[1:2] -; CGP-NEXT: v_cndmask_b32_e32 v10, v15, v17, vcc -; CGP-NEXT: v_xor_b32_e32 v1, v10, v9 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6] -; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3 -; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v2, v10 -; CGP-NEXT: v_mul_lo_u32 v2, v8, v0 -; CGP-NEXT: v_mul_lo_u32 v6, v7, v5 -; CGP-NEXT: v_xor_b32_e32 v13, v3, v10 -; CGP-NEXT: v_mul_hi_u32 v3, v7, v0 -; CGP-NEXT: v_mul_hi_u32 v0, v8, v0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v7, v1 +; CGP-NEXT: v_addc_u32_e32 v11, vcc, v8, v0, vcc +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v17 +; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v14, vcc +; CGP-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v6, v11, v[1:2] +; CGP-NEXT: v_cndmask_b32_e32 v12, v16, v13, vcc +; CGP-NEXT: v_xor_b32_e32 v13, v5, v9 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v10, v[7:8] +; CGP-NEXT: v_xor_b32_e32 v1, v12, v9 +; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v3 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_xor_b32_e32 v7, v2, v12 +; CGP-NEXT: v_mul_lo_u32 v2, v11, v0 +; CGP-NEXT: v_mul_lo_u32 v6, v10, v5 +; CGP-NEXT: v_xor_b32_e32 v8, v3, v12 +; CGP-NEXT: 
v_mul_hi_u32 v3, v10, v0 +; CGP-NEXT: v_mul_hi_u32 v0, v11, v0 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v3, v8, v5 +; CGP-NEXT: v_mul_lo_u32 v3, v11, v5 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2 -; CGP-NEXT: v_mul_hi_u32 v6, v7, v5 +; CGP-NEXT: v_mul_hi_u32 v6, v10, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 -; CGP-NEXT: v_mul_hi_u32 v5, v8, v5 +; CGP-NEXT: v_mul_hi_u32 v5, v11, v5 ; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v0 -; CGP-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v13, v3 -; CGP-NEXT: v_mul_lo_u32 v6, v12, v2 -; CGP-NEXT: v_mul_hi_u32 v7, v12, v3 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v9 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v0 +; CGP-NEXT: v_addc_u32_e32 v2, vcc, v11, v2, vcc +; CGP-NEXT: v_mul_lo_u32 v5, v8, v3 +; CGP-NEXT: v_mul_lo_u32 v6, v7, v2 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v13, v9 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CGP-NEXT: v_mul_hi_u32 v9, v7, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v7, v13, v2 -; CGP-NEXT: v_mul_hi_u32 v3, v13, v3 +; CGP-NEXT: v_mul_lo_u32 v9, v8, v2 +; CGP-NEXT: v_mul_hi_u32 v3, v8, v3 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_mul_hi_u32 v6, v12, v2 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3 -; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_mul_hi_u32 v6, v7, v2 +; CGP-NEXT: v_add_i32_e32 v3, vcc, v9, v3 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 -; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; CGP-NEXT: v_mul_hi_u32 v7, v13, v2 -; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0 -; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4] -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v9, v6 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v3, v5 +; CGP-NEXT: v_mul_hi_u32 v10, v8, v2 +; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v9 +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v5 +; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v9, v[3:4] +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 +; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v8, v5, vcc +; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v8, v5 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v2, v4 ; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -2154,10 +2154,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 ; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc 
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v10 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc ; CGP-NEXT: s_setpc_b64 s[30:31] %result = srem <2 x i64> %num, <i64 1235195, i64 1235195> ret <2 x i64> %result @@ -2193,130 +2193,130 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) { ; CHECK-NEXT: v_xor_b32_e32 v1, v2, v1 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v5, v1 -; CHECK-NEXT: v_sub_i32_e32 v9, vcc, 0, v0 -; CHECK-NEXT: v_subb_u32_e32 v10, vcc, 0, v1, vcc +; CHECK-NEXT: v_sub_i32_e32 v11, vcc, 0, v0 +; CHECK-NEXT: v_subb_u32_e32 v12, vcc, 0, v1, vcc ; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v5 ; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; CHECK-NEXT: v_mul_f32_e32 v5, 0x2f800000, v2 ; CHECK-NEXT: v_trunc_f32_e32 v7, v5 ; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v7 -; CHECK-NEXT: v_cvt_u32_f32_e32 v8, v2 -; CHECK-NEXT: v_cvt_u32_f32_e32 v11, v7 -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_cvt_u32_f32_e32 v10, v2 +; CHECK-NEXT: v_cvt_u32_f32_e32 v13, v7 +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v12, v8, v6 -; CHECK-NEXT: v_mul_lo_u32 v13, v11, v6 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3] +; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5 +; CHECK-NEXT: v_mul_lo_u32 v7, v10, v8 +; CHECK-NEXT: v_mul_lo_u32 v9, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7 -; CHECK-NEXT: v_mul_hi_u32 v7, v8, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6 +; CHECK-NEXT: v_mul_hi_u32 v6, v10, v8 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v12, v2 -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v13, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v9, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v7, vcc, v12, v7 -; CHECK-NEXT: v_mul_hi_u32 v6, v11, v6 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CHECK-NEXT: v_mul_hi_u32 v7, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v11, vcc, v11, v5, vcc -; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v9, v8, 0 +; CHECK-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v13, vcc, v13, v5, vcc +; CHECK-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v11, v10, 0 ; 
CHECK-NEXT: v_mov_b32_e32 v2, v6 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v11, v[2:3] -; CHECK-NEXT: v_ashrrev_i32_e32 v9, 31, v4 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v9 -; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v8, v[6:7] -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v9, vcc -; CHECK-NEXT: v_xor_b32_e32 v7, v2, v9 -; CHECK-NEXT: v_mul_lo_u32 v2, v11, v5 -; CHECK-NEXT: v_mul_lo_u32 v4, v8, v6 -; CHECK-NEXT: v_xor_b32_e32 v10, v3, v9 -; CHECK-NEXT: v_mul_hi_u32 v3, v8, v5 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v5 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v13, v[2:3] +; CHECK-NEXT: v_ashrrev_i32_e32 v11, 31, v4 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v11 +; CHECK-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[6:7] +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v4, v11, vcc +; CHECK-NEXT: v_xor_b32_e32 v9, v2, v11 +; CHECK-NEXT: v_mul_lo_u32 v2, v13, v5 +; CHECK-NEXT: v_mul_lo_u32 v4, v10, v8 +; CHECK-NEXT: v_xor_b32_e32 v12, v3, v11 +; CHECK-NEXT: v_mul_hi_u32 v3, v10, v5 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v5 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v3, v11, v6 +; CHECK-NEXT: v_mul_lo_u32 v3, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2 -; CHECK-NEXT: v_mul_hi_u32 v4, v8, v6 +; CHECK-NEXT: v_mul_hi_u32 v4, v10, v8 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v11, v6 +; CHECK-NEXT: v_mul_hi_u32 v5, v13, v8 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3 -; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2 -; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v11, v3, vcc -; CHECK-NEXT: v_mul_lo_u32 v4, v10, v2 -; CHECK-NEXT: v_mul_lo_u32 v5, v7, v3 -; CHECK-NEXT: v_mul_hi_u32 v6, v7, v2 -; CHECK-NEXT: v_mul_hi_u32 v2, v10, v2 -; CHECK-NEXT: v_mul_hi_u32 v8, v10, v3 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2 +; CHECK-NEXT: v_addc_u32_e32 v3, vcc, v13, v3, vcc +; CHECK-NEXT: v_mul_lo_u32 v4, v12, v2 +; CHECK-NEXT: v_mul_lo_u32 v5, v9, v3 +; CHECK-NEXT: v_mul_hi_u32 v6, v9, v2 +; CHECK-NEXT: v_mul_hi_u32 v2, v12, v2 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CHECK-NEXT: v_mul_lo_u32 v6, v10, v3 +; CHECK-NEXT: v_mul_lo_u32 v6, v12, v3 ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_mul_hi_u32 v5, v7, v3 +; CHECK-NEXT: v_mul_hi_u32 v5, v9, v3 ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; CHECK-NEXT: v_add_i32_e32 v6, vcc, v2, v4 -; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v6, 0 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, v2, v4 +; CHECK-NEXT: v_mul_hi_u32 v6, v12, v3 +; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v0, v8, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v0, v4, v[3:4] -; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v7, v2 -; 
CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v1, v6, v[3:4] -; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v10, v3, vcc -; CHECK-NEXT: v_sub_i32_e64 v3, s[4:5], v10, v3 -; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v4 +; CHECK-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v0, v6, v[3:4] +; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v9, v2 +; CHECK-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v1, v8, v[4:5] +; CHECK-NEXT: v_subb_u32_e64 v3, s[4:5], v12, v6, vcc +; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v12, v6 +; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v1 -; CHECK-NEXT: v_subb_u32_e32 v3, vcc, v3, v1, vcc +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v1 +; CHECK-NEXT: v_subb_u32_e32 v4, vcc, v4, v1, vcc ; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5] ; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v2, v0 -; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v3, vcc +; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v4, vcc ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5] ; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v6, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5] +; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5] ; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5 ; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; CHECK-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; CHECK-NEXT: v_xor_b32_e32 v0, v0, v9 -; CHECK-NEXT: v_xor_b32_e32 v1, v1, v9 -; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v9 -; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc +; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; CHECK-NEXT: v_xor_b32_e32 v0, v0, v11 +; CHECK-NEXT: v_xor_b32_e32 v1, v1, v11 +; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v11 +; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v11, vcc ; CHECK-NEXT: ; implicit-def: $vgpr5_vgpr6 ; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -2361,85 +2361,85 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_xor_b32_e32 v7, v10, v7 ; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v5 ; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v7 -; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v5 -; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v7, vcc +; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v5 +; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v7, vcc ; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 ; GISEL-NEXT: v_trunc_f32_e32 v12, v10 ; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v4 -; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v12 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v12 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 
v[11:12], s[4:5], v14, v16, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v4, v16, v10 -; GISEL-NEXT: v_mul_hi_u32 v17, v13, v10 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v12, v13, v11 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] +; GISEL-NEXT: v_mul_lo_u32 v4, v18, v10 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v17 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v17, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v12, v4 -; GISEL-NEXT: v_mul_hi_u32 v12, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v17, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_mul_lo_u32 v12, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GISEL-NEXT: v_mul_hi_u32 v12, v18, v13 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; GISEL-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; GISEL-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; GISEL-NEXT: v_mov_b32_e32 v4, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] ; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc ; GISEL-NEXT: v_xor_b32_e32 v12, v0, v4 -; GISEL-NEXT: v_mul_lo_u32 v0, v16, v10 -; GISEL-NEXT: v_mul_lo_u32 v14, v13, v11 -; GISEL-NEXT: v_xor_b32_e32 v15, v1, v4 -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v0, v18, v10 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_xor_b32_e32 v16, v1, v4 +; GISEL-NEXT: v_mul_hi_u32 v1, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v18, v10 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v1, v16, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v14, v0 -; GISEL-NEXT: v_mul_hi_u32 v14, v13, v11 +; GISEL-NEXT: v_mul_lo_u32 v1, v18, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 
+; GISEL-NEXT: v_mul_hi_u32 v11, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14 -; GISEL-NEXT: v_mul_hi_u32 v11, v16, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11 +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v18, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v0 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v16, v1, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v15, v10 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v18, v1, vcc +; GISEL-NEXT: v_mul_lo_u32 v13, v16, v10 ; GISEL-NEXT: v_mul_lo_u32 v14, v12, v11 ; GISEL-NEXT: v_lshl_b64 v[0:1], v[8:9], v6 ; GISEL-NEXT: v_mul_hi_u32 v6, v12, v10 -; GISEL-NEXT: v_mul_hi_u32 v10, v15, v10 +; GISEL-NEXT: v_mul_hi_u32 v10, v16, v10 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v14 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v8, v15, v11 +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v11 ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 ; GISEL-NEXT: v_mul_hi_u32 v9, v12, v11 ; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10 @@ -2448,127 +2448,127 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v13, vcc, v8, v6 -; GISEL-NEXT: v_mul_hi_u32 v8, v15, v11 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v8, v6 +; GISEL-NEXT: v_add_i32_e32 v14, vcc, v8, v6 ; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc ; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v5, v13, 0 ; GISEL-NEXT: v_xor_b32_e32 v6, v0, v8 ; GISEL-NEXT: v_xor_b32_e32 v8, v1, v8 -; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6 -; GISEL-NEXT: v_cvt_f32_u32_e32 v16, v8 +; GISEL-NEXT: v_cvt_f32_u32_e32 v1, v6 +; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v8 ; GISEL-NEXT: v_mov_b32_e32 v0, v10 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, v[0:1] -; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v16 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v10, v14 -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 0, v6 -; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v10 +; GISEL-NEXT: v_sub_i32_e32 v17, vcc, 0, v6 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v5, v14, v[0:1] +; GISEL-NEXT: v_mac_f32_e32 v1, 0x4f800000, v15 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v14, v1 +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v13, v[10:11] +; GISEL-NEXT: v_subb_u32_e32 v18, vcc, 0, v8, vcc +; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v14 ; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v1 ; GISEL-NEXT: v_trunc_f32_e32 v13, v10 ; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v13 -; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v1 -; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GISEL-NEXT: v_subb_u32_e32 v17, vcc, 0, v8, vcc -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v14, 0 +; 
GISEL-NEXT: v_cvt_u32_f32_e32 v15, v1 +; GISEL-NEXT: v_cvt_u32_f32_e32 v19, v13 ; GISEL-NEXT: v_sub_i32_e32 v9, vcc, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v17, v15, 0 +; GISEL-NEXT: v_subb_u32_e64 v20, s[4:5], v16, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v13, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v1, v13, v10 -; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], v15, v0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v14, v[11:12] -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v15, v0 -; GISEL-NEXT: v_mul_lo_u32 v12, v14, v11 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v18, v7 +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v19, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v1, v19, v10 +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v16, v0 +; GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v18, v15, v[11:12] +; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v20, v7 ; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v14, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v12 +; GISEL-NEXT: v_mul_lo_u32 v11, v15, v13 +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 +; GISEL-NEXT: v_mul_hi_u32 v11, v15, v10 +; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v11 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[6:7] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v9, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v18, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v12, v1, v12, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[6:7] +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v20, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v11, v1, v11, s[6:7] ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v9, v5 -; GISEL-NEXT: v_subbrev_u32_e64 v19, s[6:7], 0, v0, vcc +; GISEL-NEXT: v_subbrev_u32_e64 v14, s[6:7], 0, v0, vcc ; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v19, v7 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[8:9], v14, v7 ; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v0, v7, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, -1, s[8:9] +; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[8:9] ; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, -1, s[6:7] -; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v19, v7 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], v14, v7 ; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[6:7] +; GISEL-NEXT: v_cndmask_b32_e64 v16, v16, v21, s[6:7] ; GISEL-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v7, v19, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v7, v14, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_hi_u32 v1, v13, v10 -; GISEL-NEXT: v_mul_lo_u32 v10, v13, v11 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0 -; GISEL-NEXT: v_mul_hi_u32 v15, v14, v11 +; GISEL-NEXT: v_mul_hi_u32 v1, v19, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v19, v13 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0 +; GISEL-NEXT: v_mul_hi_u32 v12, v15, v13 ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15 -; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15 -; GISEL-NEXT: v_mul_hi_u32 v11, v13, v11 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12 +; 
GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; GISEL-NEXT: v_mul_hi_u32 v12, v19, v13 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1 -; GISEL-NEXT: v_add_i32_e32 v1, vcc, v11, v1 -; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v0 -; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v11, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 +; GISEL-NEXT: v_add_i32_e32 v1, vcc, v12, v1 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0 +; GISEL-NEXT: v_addc_u32_e32 v14, vcc, v19, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 ; GISEL-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[1:2] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v14, v[1:2] ; GISEL-NEXT: v_xor_b32_e32 v1, v5, v4 ; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v11, v[9:10] -; GISEL-NEXT: v_cndmask_b32_e32 v7, v18, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v18, v13, v[9:10] +; GISEL-NEXT: v_cndmask_b32_e32 v7, v20, v7, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5 ; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc ; GISEL-NEXT: v_xor_b32_e32 v12, v2, v5 -; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0 -; GISEL-NEXT: v_mul_lo_u32 v10, v11, v9 -; GISEL-NEXT: v_xor_b32_e32 v14, v3, v5 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v2, v14, v0 +; GISEL-NEXT: v_mul_lo_u32 v9, v13, v11 +; GISEL-NEXT: v_xor_b32_e32 v15, v3, v5 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v13, v9 -; GISEL-NEXT: v_add_i32_e32 v2, vcc, v10, v2 -; GISEL-NEXT: v_mul_hi_u32 v10, v11, v9 +; GISEL-NEXT: v_mul_lo_u32 v3, v14, v11 +; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 +; GISEL-NEXT: v_mul_hi_u32 v9, v13, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 -; GISEL-NEXT: v_mul_hi_u32 v9, v13, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v14, v11 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2 ; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0 -; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc -; GISEL-NEXT: v_mul_lo_u32 v3, v14, v0 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v13, v0 +; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v14, v2, vcc +; GISEL-NEXT: v_mul_lo_u32 v3, v15, v0 ; GISEL-NEXT: v_mul_lo_u32 v9, v12, v2 ; GISEL-NEXT: v_mul_hi_u32 v10, v12, v0 -; GISEL-NEXT: v_mul_hi_u32 v0, v14, v0 +; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0 ; GISEL-NEXT: v_xor_b32_e32 v7, v7, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9 ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v10 ; 
GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v10, v14, v2 +; GISEL-NEXT: v_mul_lo_u32 v10, v15, v2 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 ; GISEL-NEXT: v_mul_hi_u32 v9, v12, v2 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 @@ -2577,19 +2577,19 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9 ; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v3 -; GISEL-NEXT: v_mul_hi_u32 v10, v14, v2 +; GISEL-NEXT: v_mul_hi_u32 v10, v15, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v0 +; GISEL-NEXT: v_add_i32_e32 v13, vcc, v10, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v13, v[0:1] ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4 ; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v7, v4, vcc ; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v12, v2 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v14, v3, vcc -; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v14, v3 +; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v15, v3, vcc +; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v15, v3 ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6 @@ -2645,103 +2645,103 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v1, v4, v1 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v0 ; CGP-NEXT: v_cvt_f32_u32_e32 v10, v1 -; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v0 -; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v1, vcc +; CGP-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 +; CGP-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v10 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: v_mul_f32_e32 v10, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v12, v10 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v12 -; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 -; CGP-NEXT: v_cvt_u32_f32_e32 v16, v12 -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; CGP-NEXT: v_cvt_u32_f32_e32 v15, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v18, v12 +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_mul_hi_u32 v12, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v17, v13, v11 -; CGP-NEXT: v_mul_lo_u32 v18, v16, v11 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v17 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] +; CGP-NEXT: v_mul_lo_u32 v4, v18, v10 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; CGP-NEXT: v_mul_hi_u32 v11, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v18, v10 +; CGP-NEXT: v_mul_lo_u32 v12, v15, v13 +; CGP-NEXT: v_mul_lo_u32 v14, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; CGP-NEXT: v_mul_hi_u32 v12, v13, v11 +; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11 +; CGP-NEXT: v_mul_hi_u32 v11, v15, v13 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v17, v4 -; CGP-NEXT: v_add_i32_e32 v10, 
vcc, v18, v10 -; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 ; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v12, vcc, v17, v12 -; CGP-NEXT: v_mul_hi_u32 v11, v16, v11 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11 +; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; CGP-NEXT: v_mul_hi_u32 v12, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v16, vcc, v16, v10, vcc -; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0 +; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10 +; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v4 +; CGP-NEXT: v_addc_u32_e32 v18, vcc, v18, v10, vcc +; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v15, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v11 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v14 -; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12] -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v14, vcc -; CGP-NEXT: v_xor_b32_e32 v12, v4, v14 -; CGP-NEXT: v_mul_lo_u32 v4, v16, v10 -; CGP-NEXT: v_mul_lo_u32 v9, v13, v11 -; CGP-NEXT: v_xor_b32_e32 v15, v8, v14 -; CGP-NEXT: v_mul_hi_u32 v8, v13, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v10 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v16, v18, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v16, 31, v9 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v16 +; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v15, v[11:12] +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v9, v16, vcc +; CGP-NEXT: v_xor_b32_e32 v14, v4, v16 +; CGP-NEXT: v_mul_lo_u32 v4, v18, v10 +; CGP-NEXT: v_mul_lo_u32 v9, v15, v13 +; CGP-NEXT: v_xor_b32_e32 v17, v8, v16 +; CGP-NEXT: v_mul_hi_u32 v8, v15, v10 +; CGP-NEXT: v_mul_hi_u32 v10, v18, v10 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v8, v16, v11 +; CGP-NEXT: v_mul_lo_u32 v8, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 +; CGP-NEXT: v_mul_hi_u32 v9, v15, v13 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v16, v11 +; CGP-NEXT: v_mul_hi_u32 v10, v18, v13 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 -; CGP-NEXT: v_addc_u32_e32 v8, vcc, v16, v8, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v15, v4 -; CGP-NEXT: v_mul_lo_u32 v10, v12, v8 -; CGP-NEXT: v_mul_hi_u32 v11, v12, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v15, v4 -; CGP-NEXT: v_mul_hi_u32 v13, v15, v8 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4 +; CGP-NEXT: v_addc_u32_e32 v8, vcc, v18, v8, vcc +; CGP-NEXT: v_mul_lo_u32 v9, v17, v4 +; CGP-NEXT: v_mul_lo_u32 v10, v14, v8 +; CGP-NEXT: v_mul_hi_u32 v11, v14, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v17, v4 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: 
v_add_i32_e32 v9, vcc, v9, v11 ; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v11, v15, v8 +; CGP-NEXT: v_mul_lo_u32 v11, v17, v8 ; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 -; CGP-NEXT: v_mul_hi_u32 v10, v12, v8 +; CGP-NEXT: v_mul_hi_u32 v10, v14, v8 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v11, 0 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v11, v17, v8 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v13, 0 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_add_i32_e32 v10, vcc, v13, v4 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 ; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v10, v[4:5] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v12, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v1, v11, v[9:10] -; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v15, v9, vcc -; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v15, v9 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v0, v11, v[4:5] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v14, v8 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v1, v13, v[9:10] +; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v17, v11, vcc +; CGP-NEXT: v_sub_i32_e64 v9, s[4:5], v17, v11 ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v0 @@ -2754,11 +2754,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v0 -; CGP-NEXT: v_cndmask_b32_e64 v15, 0, -1, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v1 ; CGP-NEXT: v_subb_u32_e32 v1, vcc, v9, v1, vcc ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v0 -; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v15, s[4:5] +; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13 ; CGP-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc @@ -2766,10 +2766,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10 ; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CGP-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; CGP-NEXT: v_xor_b32_e32 v0, v0, v14 -; CGP-NEXT: v_xor_b32_e32 v1, v1, v14 -; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v14 -; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v0, v0, v16 +; CGP-NEXT: v_xor_b32_e32 v1, v1, v16 +; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 +; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v16, vcc ; CGP-NEXT: ; implicit-def: $vgpr11_vgpr12 ; CGP-NEXT: ; implicit-def: $vgpr8 ; CGP-NEXT: .LBB8_2: ; %Flow1 @@ -2819,117 +2819,117 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_xor_b32_e32 v3, v4, v3 ; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3 -; CGP-NEXT: v_sub_i32_e32 v12, vcc, 0, v2 -; CGP-NEXT: v_subb_u32_e32 v13, vcc, 0, v3, vcc +; CGP-NEXT: v_sub_i32_e32 v14, vcc, 0, v2 +; CGP-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc ; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v6 ; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4 ; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4 ; CGP-NEXT: 
v_mul_f32_e32 v6, 0x2f800000, v4 ; CGP-NEXT: v_trunc_f32_e32 v6, v6 ; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v6 -; CGP-NEXT: v_cvt_u32_f32_e32 v11, v4 +; CGP-NEXT: v_cvt_u32_f32_e32 v13, v4 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6 -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5] ; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_mul_hi_u32 v10, v11, v8 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_mul_hi_u32 v9, v13, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v14, v11, v9 -; CGP-NEXT: v_mul_lo_u32 v15, v6, v9 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CGP-NEXT: v_mul_lo_u32 v10, v13, v11 +; CGP-NEXT: v_mul_lo_u32 v12, v6, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; CGP-NEXT: v_mul_hi_u32 v9, v13, v11 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; CGP-NEXT: v_add_i32_e32 v8, vcc, v15, v8 -; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v10, vcc, v14, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; CGP-NEXT: v_mul_hi_u32 v10, v6, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 ; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v4 ; CGP-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc -; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v11, 0 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v13, 0 ; CGP-NEXT: v_mov_b32_e32 v4, v9 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v12, v6, v[4:5] -; CGP-NEXT: v_ashrrev_i32_e32 v12, 31, v7 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v12 -; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v11, v[9:10] -; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v12, vcc -; CGP-NEXT: v_xor_b32_e32 v7, v4, v12 +; CGP-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v6, v[4:5] +; CGP-NEXT: v_ashrrev_i32_e32 v14, 31, v7 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v14 +; CGP-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[9:10] +; CGP-NEXT: v_addc_u32_e32 v5, vcc, v7, v14, vcc +; CGP-NEXT: v_xor_b32_e32 v10, v4, v14 ; CGP-NEXT: v_mul_lo_u32 v4, v6, v8 -; CGP-NEXT: v_mul_lo_u32 v10, v11, v9 -; CGP-NEXT: v_xor_b32_e32 v13, v5, v12 -; CGP-NEXT: v_mul_hi_u32 v5, v11, v8 +; CGP-NEXT: v_mul_lo_u32 v7, v13, v11 +; CGP-NEXT: v_xor_b32_e32 v12, v5, v14 +; CGP-NEXT: v_mul_hi_u32 v5, v13, v8 ; CGP-NEXT: v_mul_hi_u32 v8, v6, v8 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v5, v6, v9 -; CGP-NEXT: 
v_add_i32_e32 v4, vcc, v10, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v11, v9 +; CGP-NEXT: v_mul_lo_u32 v5, v6, v11 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; CGP-NEXT: v_mul_hi_u32 v7, v13, v11 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10 -; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v10 -; CGP-NEXT: v_mul_hi_u32 v9, v6, v9 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_mul_hi_u32 v8, v6, v11 ; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v11, v4 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4 ; CGP-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc -; CGP-NEXT: v_mul_lo_u32 v6, v13, v4 -; CGP-NEXT: v_mul_lo_u32 v8, v7, v5 -; CGP-NEXT: v_mul_hi_u32 v9, v7, v4 -; CGP-NEXT: v_mul_hi_u32 v4, v13, v4 -; CGP-NEXT: v_mul_hi_u32 v10, v13, v5 +; CGP-NEXT: v_mul_lo_u32 v6, v12, v4 +; CGP-NEXT: v_mul_lo_u32 v7, v10, v5 +; CGP-NEXT: v_mul_hi_u32 v8, v10, v4 +; CGP-NEXT: v_mul_hi_u32 v4, v12, v4 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8 -; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_mul_lo_u32 v9, v13, v5 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_mul_hi_u32 v8, v7, v5 -; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4 -; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; CGP-NEXT: v_mul_lo_u32 v8, v12, v5 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_mul_hi_u32 v7, v10, v5 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8 -; CGP-NEXT: v_add_i32_e32 v9, vcc, v4, v6 -; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7 +; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v11, vcc, v4, v6 +; CGP-NEXT: v_mul_hi_u32 v8, v12, v5 +; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v11, 0 ; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6 -; CGP-NEXT: v_add_i32_e32 v6, vcc, v10, v6 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[5:6] -; CGP-NEXT: v_sub_i32_e32 v4, vcc, v7, v4 -; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[5:6] -; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v13, v5, vcc -; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5 -; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6 +; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v6 +; CGP-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[5:6] +; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4 +; CGP-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[6:7] +; CGP-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v8, vcc +; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v8 +; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3 ; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2 ; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5] -; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3 -; CGP-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc +; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v3 +; CGP-NEXT: 
v_subb_u32_e32 v6, vcc, v6, v3, vcc ; CGP-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5] ; CGP-NEXT: v_sub_i32_e32 v8, vcc, v4, v2 -; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc +; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3 ; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5] ; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc ; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2 ; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5] ; CGP-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc @@ -2938,11 +2938,11 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) { ; CGP-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc ; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7 ; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; CGP-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; CGP-NEXT: v_xor_b32_e32 v2, v2, v12 -; CGP-NEXT: v_xor_b32_e32 v3, v3, v12 -; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 -; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; CGP-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; CGP-NEXT: v_xor_b32_e32 v2, v2, v14 +; CGP-NEXT: v_xor_b32_e32 v3, v3, v14 +; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v14 +; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v14, vcc ; CGP-NEXT: ; implicit-def: $vgpr9_vgpr10 ; CGP-NEXT: ; implicit-def: $vgpr5 ; CGP-NEXT: s_andn2_saveexec_b64 s[4:5], s[6:7] @@ -3004,15 +3004,15 @@ define i64 @v_srem_i64_24bit(i64 %num, i64 %den) { ; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v2 ; CGP-NEXT: v_cvt_f32_u32_e32 v1, v3 -; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 ; CGP-NEXT: v_rcp_f32_e32 v1, v1 ; CGP-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; CGP-NEXT: v_cvt_u32_f32_e32 v4, v1 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v3 -; CGP-NEXT: v_mul_lo_u32 v1, v1, v4 -; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v1, 0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v2 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v1, v4 +; CGP-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v5, 0 +; CGP-NEXT: v_and_b32_e32 v5, 0xffffff, v0 +; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v2, 0 ; CGP-NEXT: v_mul_lo_u32 v0, v1, v3 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, v5, v0 ; CGP-NEXT: v_sub_i32_e32 v1, vcc, v0, v3 @@ -3035,196 +3035,196 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1 -; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v7, 0 -; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1 -; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v7 +; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v9, 0 +; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1 +; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v9 ; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3 -; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_subb_u32_e64 v12, s[4:5], 0, 0, vcc ; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2 ; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GISEL-NEXT: v_trunc_f32_e32 v5, v4 ; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5 -; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v12, v8, 
v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] -; GISEL-NEXT: v_mul_lo_u32 v5, v11, v3 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v13, v8, v4 -; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4 +; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8] +; GISEL-NEXT: v_mul_lo_u32 v5, v13, v3 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_mul_lo_u32 v8, v10, v4 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v14, v13, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v14, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v12, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v3 -; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v11, v[4:5] -; GISEL-NEXT: v_mul_hi_u32 v9, v8, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v8, v[4:5] -; GISEL-NEXT: v_and_b32_e32 v10, 0xffffff, v0 -; GISEL-NEXT: v_mul_lo_u32 v0, v11, v3 -; GISEL-NEXT: v_mul_lo_u32 v5, v8, v4 -; GISEL-NEXT: v_mul_hi_u32 v3, v11, v3 +; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v3 +; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v10, 0 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v13, v[4:5] +; GISEL-NEXT: v_and_b32_e32 v11, 0xffffff, v0 +; GISEL-NEXT: v_mul_lo_u32 v0, v13, v3 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[7:8] +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v3 +; GISEL-NEXT: v_mul_hi_u32 v3, v13, v3 +; GISEL-NEXT: v_mul_lo_u32 v5, v10, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9 +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v3, vcc, v9, v3 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v3, vcc, v7, v3 +; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GISEL-NEXT: 
v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v5, v3 ; GISEL-NEXT: v_add_i32_e32 v3, vcc, v4, v3 -; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0 -; GISEL-NEXT: v_addc_u32_e32 v8, vcc, v11, v3, vcc +; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0 +; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v13, v3, vcc ; GISEL-NEXT: v_mul_lo_u32 v4, 0, v0 -; GISEL-NEXT: v_mul_lo_u32 v5, v10, v8 +; GISEL-NEXT: v_mul_lo_u32 v5, v11, v7 ; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v0 +; GISEL-NEXT: v_mul_hi_u32 v6, v11, v0 ; GISEL-NEXT: v_mul_hi_u32 v0, 0, v0 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 -; GISEL-NEXT: v_mul_lo_u32 v5, 0, v8 +; GISEL-NEXT: v_mul_lo_u32 v5, 0, v7 ; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, v10, v8 +; GISEL-NEXT: v_mul_hi_u32 v6, v11, v7 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v4 +; GISEL-NEXT: v_add_i32_e32 v12, vcc, v0, v4 ; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v3 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v8 -; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v7 -; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v12, 0 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7 +; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v9 +; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v6, v[0:1] -; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v7 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v7, v[0:1] +; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v8 ; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v0 -; GISEL-NEXT: v_trunc_f32_e32 v11, v7 -; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v11 -; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v0 -; GISEL-NEXT: v_sub_i32_e32 v13, vcc, 0, v3 -; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v13, v12, 0 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v9, v[5:6] +; GISEL-NEXT: v_trunc_f32_e32 v9, v7 +; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v9 +; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v0 +; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v3 +; GISEL-NEXT: v_cvt_u32_f32_e32 v16, v9 +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v14, v13, 0 +; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], 0, v12, v[5:6] ; GISEL-NEXT: v_mov_b32_e32 v0, v8 -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, vcc -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v12, v[8:9] -; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v10, v4 -; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v5 -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v7 -; GISEL-NEXT: v_mul_lo_u32 v5, v12, v8 -; GISEL-NEXT: v_mul_hi_u32 v9, v12, v7 -; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1 +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v16, v[0:1] +; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, vcc +; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v11, v4 +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v13, v[5:6] +; GISEL-NEXT: v_mul_lo_u32 v4, v16, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v13, v7 +; GISEL-NEXT: v_mul_lo_u32 
v5, v13, v10 +; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v9, vcc +; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], 0, v9 +; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v1 ; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v9 +; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v8 ; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] -; GISEL-NEXT: v_mul_lo_u32 v9, v11, v8 -; GISEL-NEXT: v_mul_hi_u32 v7, v11, v7 +; GISEL-NEXT: v_mul_lo_u32 v8, v16, v10 +; GISEL-NEXT: v_mul_hi_u32 v7, v16, v7 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 -; GISEL-NEXT: v_mul_hi_u32 v5, v12, v8 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GISEL-NEXT: v_mul_hi_u32 v5, v13, v10 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 ; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v9, v7 -; GISEL-NEXT: v_mul_hi_u32 v8, v11, v8 +; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7 +; GISEL-NEXT: v_mul_hi_u32 v8, v16, v10 ; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5 ; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v8, v5 -; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v4 -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v11, v5, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v7, 0 -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v0, vcc +; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v13, v4 +; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], v16, v5, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, 0 +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v17 +; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v0, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e64 v9, -1, v6, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v8, v[0:1] -; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v1 -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v7, v[5:6] -; GISEL-NEXT: v_mul_lo_u32 v12, v8, v4 -; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v11, vcc -; GISEL-NEXT: v_mul_lo_u32 v13, v7, v5 -; GISEL-NEXT: v_mul_hi_u32 v14, v7, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v11, -1, v6, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v10, v[0:1] +; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v12, v1 +; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v15, v9, v[5:6] ; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1 -; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc -; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 -; GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v6, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; GISEL-NEXT: v_mul_lo_u32 v14, v8, v5 -; GISEL-NEXT: v_mul_hi_u32 v4, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_mul_hi_u32 v13, v7, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4 -; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13 -; GISEL-NEXT: v_mul_hi_u32 v5, v8, v5 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12 -; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc -; 
GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v12 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v8, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v4 +; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7 +; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13 +; GISEL-NEXT: v_cndmask_b32_e32 v14, -1, v5, vcc +; GISEL-NEXT: v_mul_hi_u32 v5, v9, v4 +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_mul_lo_u32 v6, v10, v7 +; GISEL-NEXT: v_mul_hi_u32 v4, v10, v4 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5 +; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8 +; GISEL-NEXT: v_mul_hi_u32 v7, v10, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5 +; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v9, v4 +; GISEL-NEXT: v_addc_u32_e32 v6, vcc, v10, v5, vcc ; GISEL-NEXT: v_mul_lo_u32 v5, 0, v4 -; GISEL-NEXT: v_mul_lo_u32 v8, v2, v7 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v4 +; GISEL-NEXT: v_mul_lo_u32 v7, v2, v6 +; GISEL-NEXT: v_mul_hi_u32 v9, v2, v4 ; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v0, v1 -; GISEL-NEXT: v_subbrev_u32_e32 v12, vcc, 0, v11, vcc -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8 -; GISEL-NEXT: v_mul_lo_u32 v8, 0, v7 +; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v13, vcc +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GISEL-NEXT: v_mul_lo_u32 v7, 0, v6 ; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4 -; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13 -; GISEL-NEXT: v_mul_hi_u32 v13, v2, v7 +; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9 +; GISEL-NEXT: v_mul_hi_u32 v9, v2, v6 ; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4 -; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v13 -; GISEL-NEXT: v_add_i32_e32 v8, vcc, v4, v5 -; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, 0 -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6 -; GISEL-NEXT: v_mul_hi_u32 v6, 0, v7 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4 +; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9 +; GISEL-NEXT: v_add_i32_e32 v9, vcc, v4, v5 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0 +; GISEL-NEXT: v_mul_hi_u32 v7, 0, v6 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14 ; GISEL-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc ; GISEL-NEXT: v_mov_b32_e32 v0, v5 -; GISEL-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v6, v[0:1] -; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v0, v10, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], 0, v8, v[5:6] -; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v7, vcc +; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[0:1] +; GISEL-NEXT: v_cndmask_b32_e32 v8, v13, v8, vcc +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v12, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], 0, v9, v[5:6] ; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 -; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v5, vcc -; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5 +; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v7 
+; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], 0, v7, vcc ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc ; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v3 ; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc @@ -3264,15 +3264,15 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) { ; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, 0 ; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v7 ; CGP-NEXT: v_cvt_u32_f32_e32 v6, v0 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v5, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 +; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v5, 0 ; CGP-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 -; CGP-NEXT: v_mul_lo_u32 v0, v0, v6 -; CGP-NEXT: v_mul_lo_u32 v5, v1, v3 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v0, 0 -; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5 -; CGP-NEXT: v_add_i32_e32 v0, vcc, v6, v1 -; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v0, 0 +; CGP-NEXT: v_mul_lo_u32 v5, v0, v6 +; CGP-NEXT: v_mul_lo_u32 v7, v1, v3 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0 +; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v7 +; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v1 +; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v6, 0 ; CGP-NEXT: v_sub_i32_e32 v7, vcc, v5, v3 ; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3 ; CGP-NEXT: v_mul_lo_u32 v6, v1, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll index 9e412b6c7cd0a..23ef596c021c2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -132,65 +132,64 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; 
GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX8-NEXT: v_mul_hi_u32 v5, s9, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -203,54 +202,55 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v4, 0 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v0, v2 +; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s10, v6, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: 
v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v5, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s10, v3, v[1:2] -; GFX8-NEXT: v_sub_u32_e32 v6, vcc, s8, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s11, v4, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v7, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s10, v7, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s11, v6, v[2:3] ; GFX8-NEXT: v_mov_b32_e32 v2, s9 -; GFX8-NEXT: v_mov_b32_e32 v5, s11 -; GFX8-NEXT: v_subb_u32_e64 v7, s[0:1], v2, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v1 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v6 +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, s8, v0 +; GFX8-NEXT: v_subb_u32_e64 v5, s[0:1], v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s9, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v5 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v7 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s10, v6 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v5 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s10, v3 ; GFX8-NEXT: v_subbrev_u32_e64 v8, s[0:1], 0, v0, vcc -; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v4 -; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1] +; GFX8-NEXT: v_add_u32_e64 v9, s[0:1], 1, v6 +; GFX8-NEXT: v_addc_u32_e64 v10, s[0:1], 0, v7, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s11, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[0:1] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v2 -; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s10, v4 +; GFX8-NEXT: v_subb_u32_e32 v0, vcc, v0, v1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s11, v8 -; GFX8-NEXT: v_subrev_u32_e32 v5, vcc, s10, v2 +; GFX8-NEXT: v_subrev_u32_e32 v14, vcc, s10, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v12, s[0:1] ; GFX8-NEXT: v_add_u32_e64 v12, s[0:1], 1, v9 -; GFX8-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v0, vcc +; GFX8-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v0, vcc ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v15, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v13, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v0, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v10, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v14, vcc ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; 
GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm @@ -271,63 +271,64 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; 
GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 ; GFX9-NEXT: v_mul_hi_u32 v0, s17, v0 -; GFX9-NEXT: v_mul_hi_u32 v6, s17, v1 +; GFX9-NEXT: v_mul_hi_u32 v5, s17, v1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 @@ -339,53 +340,52 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v5, 0 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s18, v6, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v3, v3, v2, v6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s18, v3, v[1:2] -; GFX9-NEXT: v_sub_co_u32_e32 v7, vcc, s16, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s19, v5, v[1:2] +; GFX9-NEXT: v_add3_u32 v7, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s18, v7, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s19, v6, v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_mov_b32_e32 v4, s19 -; GFX9-NEXT: v_subb_co_u32_e64 v8, s[0:1], v2, v1, vcc -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v8 -; GFX9-NEXT: v_sub_u32_e32 v0, s17, v1 -; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v7 +; GFX9-NEXT: v_sub_co_u32_e32 v3, vcc, s16, v0 +; GFX9-NEXT: v_subb_co_u32_e64 v5, s[0:1], v2, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v5 +; GFX9-NEXT: v_sub_u32_e32 v0, s17, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v8 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1] -; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s18, v7 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v5 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] +; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v3 ; GFX9-NEXT: 
v_subbrev_co_u32_e64 v9, s[0:1], 0, v0, vcc -; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v5 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v3, s[0:1] +; GFX9-NEXT: v_add_co_u32_e64 v10, s[0:1], 1, v6 +; GFX9-NEXT: v_addc_co_u32_e64 v11, s[0:1], 0, v7, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s19, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[0:1] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v4, vcc +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s18, v4 +; GFX9-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s19, v9 -; GFX9-NEXT: v_subrev_co_u32_e32 v4, vcc, s18, v2 +; GFX9-NEXT: v_subrev_co_u32_e32 v15, vcc, s18, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[0:1] ; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v15, vcc, 0, v0, vcc +; GFX9-NEXT: v_subbrev_co_u32_e32 v16, vcc, 0, v0, vcc ; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1] ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v14, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v5, v0, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v2, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[0:1] -; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[12:13] -; GFX9-NEXT: global_store_dwordx2 v6, v[2:3], s[14:15] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v14, vcc +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v6, v0, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v7, v1, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v15, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v16, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[0:1] +; GFX9-NEXT: global_store_dwordx2 v8, v[0:1], s[12:13] +; GFX9-NEXT: global_store_dwordx2 v8, v[2:3], s[14:15] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: udivrem_i64: @@ -1005,72 +1005,72 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0 ; GFX8-NEXT: v_add_f32_e32 v0, v0, v1 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: v_mov_b32_e32 v10, s13 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0 ; GFX8-NEXT: v_trunc_f32_e32 v2, v1 ; GFX8-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX8-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX8-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v7, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 +; GFX8-NEXT: 
v_cvt_u32_f32_e32 v6, v0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX8-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX8-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v5, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v7, v5 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v1, vcc -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX8-NEXT: v_mul_hi_u32 v6, v3, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v1, vcc +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX8-NEXT: v_mul_lo_u32 v1, v7, v0 ; GFX8-NEXT: s_sub_u32 s2, 0, s14 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX8-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX8-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX8-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX8-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX8-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX8-NEXT: v_mul_lo_u32 v2, v6, v4 ; GFX8-NEXT: s_subb_u32 s3, 0, s15 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v6 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v5 -; GFX8-NEXT: v_mul_hi_u32 v1, v4, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, v3, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 +; GFX8-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc 
+; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v6, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc ; GFX8-NEXT: v_mul_lo_u32 v2, s9, v0 ; GFX8-NEXT: v_mul_lo_u32 v3, s8, v1 ; GFX8-NEXT: v_mul_hi_u32 v4, s8, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 -; GFX8-NEXT: v_mov_b32_e32 v5, s13 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 @@ -1083,136 +1083,136 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v3, vcc, v4, v3 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v0, v2 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v0, v2 ; GFX8-NEXT: v_mul_hi_u32 v4, s9, v1 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v7, 0 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s12, v8, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v4, v2 -; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s12, v8, v[1:2] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s13, v7, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_add_u32_e32 v9, vcc, v4, v2 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s12, v9, v[1:2] ; GFX8-NEXT: v_sub_u32_e32 v1, vcc, s8, v0 -; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v3, v2, vcc -; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v2 +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s13, v8, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: v_subb_u32_e64 v0, s[0:1], v2, v4, vcc +; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s9, v4 ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1] ; GFX8-NEXT: v_cvt_f32_u32_e32 v3, s15 ; GFX8-NEXT: v_cvt_f32_u32_e32 v4, s14 -; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v2, v5, vcc +; GFX8-NEXT: v_subb_u32_e32 v12, vcc, v2, v10, vcc ; GFX8-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 ; GFX8-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX8-NEXT: v_subrev_u32_e32 v10, vcc, s12, v1 -; GFX8-NEXT: v_subbrev_u32_e64 v11, s[0:1], 0, v6, vcc +; GFX8-NEXT: v_subrev_u32_e32 v13, vcc, s12, v1 +; GFX8-NEXT: v_subbrev_u32_e64 v14, s[0:1], 0, v12, vcc ; GFX8-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX8-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX8-NEXT: v_trunc_f32_e32 v4, v3 ; GFX8-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 ; GFX8-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX8-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GFX8-NEXT: v_add_u32_e64 v13, s[0:1], 1, v7 -; GFX8-NEXT: v_addc_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v6, v5, vcc -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] -; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] -; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v11 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX8-NEXT: v_mul_lo_u32 v17, v12, v3 -; 
GFX8-NEXT: v_mul_hi_u32 v6, v12, v2 -; GFX8-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; GFX8-NEXT: v_cvt_u32_f32_e32 v15, v2 +; GFX8-NEXT: v_add_u32_e64 v16, s[0:1], 1, v8 +; GFX8-NEXT: v_addc_u32_e64 v17, s[0:1], 0, v9, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_cvt_u32_f32_e32 v18, v4 +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s13, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4] +; GFX8-NEXT: v_cmp_le_u32_e64 s[0:1], s12, v13 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5] +; GFX8-NEXT: v_mul_lo_u32 v4, v18, v2 +; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v12, v10, vcc +; GFX8-NEXT: v_mul_lo_u32 v5, v15, v6 +; GFX8-NEXT: v_mul_hi_u32 v10, v15, v2 +; GFX8-NEXT: v_mul_hi_u32 v2, v18, v2 +; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v14 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v10 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v6, v15, v3 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v17, v4 -; GFX8-NEXT: v_mul_hi_u32 v17, v12, v3 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v6, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v17 -; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v17 -; GFX8-NEXT: v_add_u32_e32 v17, vcc, 1, v13 -; GFX8-NEXT: v_addc_u32_e32 v18, vcc, 0, v14, vcc -; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v10 -; GFX8-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v5, vcc +; GFX8-NEXT: v_mul_lo_u32 v10, v18, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_mul_hi_u32 v5, v15, v6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v10, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v10, v5 +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 1, v16 +; GFX8-NEXT: v_addc_u32_e32 v12, vcc, 0, v17, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] +; GFX8-NEXT: v_subrev_u32_e32 v19, vcc, s12, v13 +; GFX8-NEXT: v_mul_hi_u32 v6, v18, v6 +; GFX8-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v7, vcc ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v2 -; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, v15, v3, vcc -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v13, v17, vcc +; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v2 +; GFX8-NEXT: v_addc_u32_e32 v18, vcc, v18, v4, vcc +; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v10, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, v5 -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v15, v[2:3] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v14, v18, vcc -; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[2:3], s3, v12, v[5:6] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v3, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v13, s[0:1] -; GFX8-NEXT: v_mul_lo_u32 v7, v15, v4 -; GFX8-NEXT: 
v_mul_lo_u32 v8, v12, v5 -; GFX8-NEXT: v_mul_hi_u32 v9, v12, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v19, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3] +; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v3, s[0:1] +; GFX8-NEXT: v_mad_u64_u32 v[7:8], s[2:3], s3, v15, v[5:6] +; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v12, vcc +; GFX8-NEXT: v_mul_lo_u32 v6, v18, v4 +; GFX8-NEXT: v_mul_lo_u32 v8, v15, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v10, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v9, v15, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v19, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v9 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v9, v15, v5 -; GFX8-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_mul_hi_u32 v8, v12, v5 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_mul_lo_u32 v9, v18, v7 +; GFX8-NEXT: v_mul_hi_u32 v4, v18, v4 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_mul_hi_u32 v8, v15, v7 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v9, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v9, v8 -; GFX8-NEXT: v_mul_hi_u32 v5, v15, v5 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v7 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v8, v7 -; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v7 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v4 -; GFX8-NEXT: v_mul_lo_u32 v8, s10, v5 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v6, s[0:1] +; GFX8-NEXT: v_mul_hi_u32 v7, v18, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v7, v6 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v15, v4 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v18, v6, vcc +; GFX8-NEXT: v_mul_lo_u32 v8, s11, v4 +; GFX8-NEXT: v_mul_lo_u32 v9, s10, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v1, v5, s[0:1] ; GFX8-NEXT: v_mul_hi_u32 v1, s10, v4 ; GFX8-NEXT: v_mul_hi_u32 v4, s11, v4 -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v8, v9 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v5, v1 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX8-NEXT: v_mul_lo_u32 v7, s11, v5 +; GFX8-NEXT: v_mul_lo_u32 v5, s11, v7 ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, s10, v5 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v7, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_mul_hi_u32 v8, s10, v7 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 ; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v8 +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v5, v8 ; GFX8-NEXT: v_add_u32_e32 v11, vcc, v4, v1 -; GFX8-NEXT: v_mul_hi_u32 v8, s11, v5 +; GFX8-NEXT: v_mul_hi_u32 v7, s11, v7 ; GFX8-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s14, v11, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 
1, vcc -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v7, v1 -; GFX8-NEXT: v_add_u32_e32 v12, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v8, v1 +; GFX8-NEXT: v_add_u32_e32 v12, vcc, v7, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-NEXT: v_mad_u64_u32 v[8:9], s[2:3], s14, v12, v[1:2] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v0, v10, s[0:1] @@ -1279,60 +1279,61 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_trunc_f32_e32 v2, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0xcf800000, v2 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v6, v3, v1 -; GFX9-NEXT: v_mul_lo_u32 v7, v4, v1 -; GFX9-NEXT: v_mul_hi_u32 v8, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v7, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 -; GFX9-NEXT: v_add_u32_e32 v2, v6, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v6, v0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: v_mul_lo_u32 v3, v6, v4 +; GFX9-NEXT: v_mul_lo_u32 v5, v7, v4 +; GFX9-NEXT: v_mul_hi_u32 v8, v6, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v7, v5 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v8 +; GFX9-NEXT: v_add_u32_e32 v1, v3, v1 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v1, vcc -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v3, 0 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s2, v4, v[1:2] -; GFX9-NEXT: v_mul_hi_u32 v6, v3, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v4 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v7, v1, vcc +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s2, v6, 0 +; GFX9-NEXT: v_mov_b32_e32 v8, s5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v7, v[1:2] +; GFX9-NEXT: v_mul_lo_u32 v1, v7, v0 ; GFX9-NEXT: s_sub_u32 s2, 0, s6 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s3, v3, v[1:2] -; GFX9-NEXT: v_mul_lo_u32 v2, v4, v0 -; GFX9-NEXT: v_mul_hi_u32 v0, v4, v0 -; GFX9-NEXT: v_mul_lo_u32 v5, v3, v1 +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v6, v[2:3] +; GFX9-NEXT: v_mul_hi_u32 v3, v6, v0 +; GFX9-NEXT: v_mul_hi_u32 v0, v7, v0 +; GFX9-NEXT: 
v_mul_lo_u32 v2, v6, v4 ; GFX9-NEXT: s_subb_u32 s3, 0, s7 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v6, v4, v1 -; GFX9-NEXT: v_add_u32_e32 v2, v5, v2 -; GFX9-NEXT: v_mul_hi_u32 v5, v3, v1 -; GFX9-NEXT: v_mul_hi_u32 v1, v4, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v5 -; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mul_lo_u32 v3, v7, v4 +; GFX9-NEXT: v_add_u32_e32 v1, v2, v1 +; GFX9-NEXT: v_mul_hi_u32 v2, v6, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2 -; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX9-NEXT: v_add3_u32 v1, v5, v2, v1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 +; GFX9-NEXT: v_mul_hi_u32 v3, v7, v4 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v7, v1, vcc ; GFX9-NEXT: v_mul_lo_u32 v2, s17, v0 ; GFX9-NEXT: v_mul_lo_u32 v3, s16, v1 ; GFX9-NEXT: v_mul_hi_u32 v4, s16, v0 @@ -1349,114 +1350,113 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v0, v2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v7, 0 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v0, v2 +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s4, v9, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX9-NEXT: v_add_u32_e32 v3, v4, v3 -; GFX9-NEXT: v_add3_u32 v8, v3, v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[0:1], s4, v8, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s5, v7, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v3, s17 +; GFX9-NEXT: v_add3_u32 v10, v3, v2, v5 +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s4, v10, v[1:2] ; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s16, v0 -; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v3, v2, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s5, v9, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_subb_co_u32_e64 v0, s[0:1], v2, v4, vcc ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v1 +; GFX9-NEXT: v_sub_u32_e32 v2, s17, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[0:1] ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v0 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v3, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v4, s[0:1] ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s7 ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX9-NEXT: v_sub_u32_e32 v2, s17, v2 -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v5, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v8, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f800000, v3 ; GFX9-NEXT: v_add_f32_e32 v2, v2, v4 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX9-NEXT: v_subrev_co_u32_e32 v10, vcc, s4, v1 -; GFX9-NEXT: v_subbrev_co_u32_e64 v11, 
s[0:1], 0, v6, vcc +; GFX9-NEXT: v_subrev_co_u32_e32 v13, vcc, s4, v1 +; GFX9-NEXT: v_subbrev_co_u32_e64 v14, s[0:1], 0, v12, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GFX9-NEXT: v_trunc_f32_e32 v4, v3 ; GFX9-NEXT: v_mul_f32_e32 v3, 0xcf800000, v4 ; GFX9-NEXT: v_add_f32_e32 v2, v3, v2 -; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v2 -; GFX9-NEXT: v_add_co_u32_e64 v13, s[0:1], 1, v7 -; GFX9-NEXT: v_addc_co_u32_e64 v14, s[0:1], 0, v8, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v4 -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, -1, s[0:1] -; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v5, vcc -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s2, v15, v[3:4] -; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v10 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[0:1] -; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], s3, v12, v[3:4] -; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v11 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v4, v15, v2 -; GFX9-NEXT: v_mul_lo_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v5, v12, v2 -; GFX9-NEXT: v_mul_hi_u32 v2, v15, v2 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v2 +; GFX9-NEXT: v_add_co_u32_e64 v16, s[0:1], 1, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v17, s[0:1], 0, v10, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, 0 +; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v4 +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s5, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v18, v[3:4] +; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v13 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[0:1], s3, v15, v[4:5] +; GFX9-NEXT: v_mul_lo_u32 v4, v18, v2 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v12, v8, vcc +; GFX9-NEXT: v_mul_lo_u32 v5, v15, v6 +; GFX9-NEXT: v_mul_hi_u32 v8, v15, v2 +; GFX9-NEXT: v_mul_hi_u32 v2, v18, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], s5, v14 ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_mul_lo_u32 v5, v15, v3 -; GFX9-NEXT: v_add_u32_e32 v4, v17, v4 -; GFX9-NEXT: v_mul_hi_u32 v17, v12, v3 -; GFX9-NEXT: v_mul_hi_u32 v3, v15, v3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v5, v2 +; GFX9-NEXT: v_mul_lo_u32 v8, v18, v6 +; GFX9-NEXT: v_add_u32_e32 v4, v5, v4 +; GFX9-NEXT: v_mul_hi_u32 v5, v15, v6 +; GFX9-NEXT: v_mul_hi_u32 v6, v18, v6 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v8, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 ; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc -; GFX9-NEXT: v_add_u32_e32 v5, v5, v17 -; GFX9-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 -; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v14, vcc +; GFX9-NEXT: v_add_u32_e32 v5, v8, v5 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, 1, v16 +; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v17, vcc ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v2 -; GFX9-NEXT: v_add3_u32 v3, v5, v4, v3 -; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v12, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc +; GFX9-NEXT: v_add3_u32 v4, v5, 
v4, v6 +; GFX9-NEXT: v_add_co_u32_e32 v15, vcc, v15, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[0:1] +; GFX9-NEXT: v_addc_co_u32_e32 v18, vcc, v18, v4, vcc +; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s2, v15, 0 +; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v13 ; GFX9-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v15, v[2:3] -; GFX9-NEXT: v_subrev_co_u32_e32 v19, vcc, s4, v10 -; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v6, vcc -; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s3, v12, v[2:3] -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc -; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, v13, s[0:1] -; GFX9-NEXT: v_mul_lo_u32 v6, v15, v4 -; GFX9-NEXT: v_mul_lo_u32 v7, v12, v5 -; GFX9-NEXT: v_mul_hi_u32 v9, v12, v4 -; GFX9-NEXT: v_mul_hi_u32 v4, v15, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v18, vcc -; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v6, s[2:3], v6, v9 +; GFX9-NEXT: v_mad_u64_u32 v[5:6], s[0:1], s2, v18, v[2:3] +; GFX9-NEXT: v_subbrev_co_u32_e32 v20, vcc, 0, v7, vcc +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v8, vcc +; GFX9-NEXT: v_mad_u64_u32 v[7:8], s[0:1], s3, v15, v[5:6] +; GFX9-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v11 +; GFX9-NEXT: v_mul_lo_u32 v5, v18, v4 +; GFX9-NEXT: v_mul_lo_u32 v6, v15, v7 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[0:1] +; GFX9-NEXT: v_mul_hi_u32 v9, v15, v4 +; GFX9-NEXT: v_mul_hi_u32 v4, v18, v4 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v6 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_mul_lo_u32 v9, v15, v5 -; GFX9-NEXT: v_add_u32_e32 v6, v7, v6 -; GFX9-NEXT: v_mul_hi_u32 v7, v12, v5 -; GFX9-NEXT: v_mul_hi_u32 v5, v15, v5 +; GFX9-NEXT: v_add_co_u32_e64 v5, s[2:3], v5, v9 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] +; GFX9-NEXT: v_mul_lo_u32 v9, v18, v7 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v5 +; GFX9-NEXT: v_mul_hi_u32 v6, v15, v7 +; GFX9-NEXT: v_mul_hi_u32 v7, v18, v7 ; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v9, v4 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v7 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[2:3] ; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v6 -; GFX9-NEXT: v_add_u32_e32 v7, v9, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3] -; GFX9-NEXT: v_add3_u32 v5, v7, v6, v5 -; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v12, v4 -; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v15, v5, s[2:3] +; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v4, v5 +; GFX9-NEXT: v_add_u32_e32 v6, v9, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[2:3] +; GFX9-NEXT: v_add3_u32 v5, v6, v5, v7 +; GFX9-NEXT: v_add_co_u32_e64 v4, s[2:3], v15, v4 +; GFX9-NEXT: v_addc_co_u32_e64 v5, s[2:3], v18, v5, s[2:3] ; GFX9-NEXT: v_mul_lo_u32 v6, s19, v4 ; GFX9-NEXT: v_mul_lo_u32 v7, s18, v5 ; GFX9-NEXT: v_mul_hi_u32 v9, s18, v4 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v14, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v20, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v10, v3, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v8, v13, v19, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v20, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 ; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v9 diff --git 
a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll new file mode 100644 index 0000000000000..d7d623ac89146 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-intrinsic-missing-nocallback.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s + +; Make sure we do not infer anything about implicit inputs through an +; intrinsic call which is not nocallback. + +declare zeroext i32 @return_i32() + +define i32 @test_i32_return() gc "statepoint-example" { +; CHECK-LABEL: define i32 @test_i32_return( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] gc "statepoint-example" { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SAFEPOINT_TOKEN:%.*]] = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0) +; CHECK-NEXT: [[CALL1:%.*]] = call zeroext i32 @llvm.experimental.gc.result.i32(token [[SAFEPOINT_TOKEN]]) +; CHECK-NEXT: ret i32 [[CALL1]] +; +entry: + %safepoint_token = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr elementtype(i32 ()) @return_i32, i32 0, i32 0, i32 0, i32 0) + %call1 = call zeroext i32 @llvm.experimental.gc.result.i32(token %safepoint_token) + ret i32 %call1 +} + +declare token @llvm.experimental.gc.statepoint.p0(i64 immarg, i32 immarg, ptr, i32 immarg, i32 immarg, ...) +declare i32 @llvm.experimental.gc.result.i32(token) #0 + +attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) } +;. +; CHECK: attributes #[[ATTR0]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll new file mode 100644 index 0000000000000..71c509afa8e64 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 5 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-attributor -mcpu=gfx90a %s | FileCheck %s + +; Make sure we infer no inputs are used through some intrinsics + +define void @use_fake_use(i32 %arg) { +; CHECK-LABEL: define void @use_fake_use( +; CHECK-SAME: i32 [[ARG:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void (...) @llvm.fake.use(i32 [[ARG]]) +; CHECK-NEXT: ret void +; + call void (...) 
@llvm.fake.use(i32 %arg) + ret void +} + +define void @use_donothing() { +; CHECK-LABEL: define void @use_donothing( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: ret void +; + call void @llvm.donothing() + ret void +} + +define void @use_assume(i1 %arg) { +; CHECK-LABEL: define void @use_assume( +; CHECK-SAME: i1 [[ARG:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.assume(i1 [[ARG]]) +; CHECK-NEXT: ret void +; + call void @llvm.assume(i1 %arg) + ret void +} + +define void @use_trap() { +; CHECK-LABEL: define void @use_trap( +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: call void @llvm.trap() +; CHECK-NEXT: ret void +; + call void @llvm.trap() + ret void +} + +define void @use_debugtrap() { +; CHECK-LABEL: define void @use_debugtrap( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.debugtrap() +; CHECK-NEXT: ret void +; + call void @llvm.debugtrap() + ret void +} + +define void @use_ubsantrap() { +; CHECK-LABEL: define void @use_ubsantrap( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.ubsantrap(i8 0) +; CHECK-NEXT: ret void +; + call void @llvm.ubsantrap(i8 0) + ret void +} + +;. +; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nounwind "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) "target-cpu"="gfx90a" } +; CHECK: attributes #[[ATTR6:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) "target-cpu"="gfx90a" } +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll new file mode 100644 index 0000000000000..06150e4277e9a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll @@ -0,0 +1,519 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=GFX942 %s + +; These situations are "special" in that they either have an alloca that is not +; in the entry block or that they have a dynamic alloca. Both situations affect +; prolog/epilog generation. + +declare amdgpu_gfx void @foo() + +define amdgpu_cs_chain void @test_alloca() { +; GFX12-LABEL: test_alloca: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s0, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s0, 0x200 +; GFX12-NEXT: scratch_store_b32 off, v0, s0 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s0, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_add_i32 s32, s0, 0x400 +; GFX942-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds = %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_alloca_var_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s0, s0, 15 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s1, s0 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: s_add_i32 s0, s0, 15 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_lshl_b32 s0, s0, 6 +; GFX942-NEXT: s_mov_b32 s1, s32 +; GFX942-NEXT: s_add_i32 s32, s1, s0 +; GFX942-NEXT: scratch_store_dword off, v0, s1 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_var(i32 %count) { +; GFX12-LABEL: test_alloca_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 
15 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0 +; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_lshl_add_u32 v1, s0, 5, s1 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: v_readfirstlane_b32 s32, v1 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v1, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v1, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB2_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_mov_b32 s0, s32 +; GFX942-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1 +; GFX942-NEXT: scratch_store_dword off, v0, s0 +; GFX942-NEXT: v_readfirstlane_b32 s32, v1 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call() { +; GFX12-LABEL: test_alloca_and_call: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_mov_b32 s2, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s2, 0x200 +; GFX12-NEXT: scratch_store_b32 off, v0, s2 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_and_call: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s2, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_add_i32 s32, s2, 0x400 +; GFX942-NEXT: scratch_store_dword off, v0, s2 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds 
= %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_alloca_and_call_var_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, 15 +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_mov_b32 s1, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s1, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_and_call_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: s_add_i32 s0, s0, 15 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: s_lshl_b32 s2, s0, 6 +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b32 s3, s32 +; GFX942-NEXT: s_add_i32 s32, s3, s2 +; GFX942-NEXT: scratch_store_dword off, v0, s3 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) { +; GFX12-LABEL: test_alloca_and_call_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0 +; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v1, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s1, 
s32 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_lshl_add_u32 v1, s0, 5, s1 +; GFX12-NEXT: scratch_store_b32 off, v0, s1 +; GFX12-NEXT: v_readfirstlane_b32 s32, v1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_alloca_and_call_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v1, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v1, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s3, s32 +; GFX942-NEXT: v_mov_b32_e32 v1, s3 +; GFX942-NEXT: v_lshl_add_u32 v1, s2, 6, v1 +; GFX942-NEXT: scratch_store_dword off, v0, s3 +; GFX942-NEXT: v_readfirstlane_b32 s32, v1 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + store i32 0, ptr addrspace(5) %v, align 4 + call amdgpu_gfx void @foo() + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca() { +; GFX12-LABEL: test_call_and_alloca: +; GFX12: ; %bb.0: ; %.entry +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[0:1] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s1, s1 +; GFX12-NEXT: s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s4, 0x200 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-NEXT: scratch_store_b32 off, v0, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_call_and_alloca: +; GFX942: ; %bb.0: ; %.entry +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: s_add_i32 s32, s4, 0x400 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: scratch_store_dword off, v0, s4 +; GFX942-NEXT: s_endpgm +.entry: + br label %SW_C + +SW_C: ; preds = %.entry + %v = alloca i32, i32 1, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_call_and_alloca_var_uniform: +; 
GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_lshl_b32 s0, s0, 2 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: s_add_co_i32 s0, s0, 15 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_and_b32 s0, s0, -16 +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_lshl_b32 s0, s0, 5 +; GFX12-NEXT: v_mov_b32_e32 v40, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_i32 s32, s4, s0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: scratch_store_b32 off, v40, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_call_and_alloca_var_uniform: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_lshl_b32 s0, s0, 2 +; GFX942-NEXT: s_add_i32 s0, s0, 15 +; GFX942-NEXT: s_and_b32 s0, s0, -16 +; GFX942-NEXT: s_lshl_b32 s2, s0, 6 +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: v_mov_b32_e32 v40, 0 +; GFX942-NEXT: s_add_i32 s32, s4, s2 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: scratch_store_dword off, v40, s4 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca_var(i32 %count) { +; GFX12-LABEL: test_call_and_alloca_var: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT: v_mov_b32_e32 v40, 0 +; GFX12-NEXT: s_mov_b32 s1, exec_lo +; GFX12-NEXT: s_mov_b32 s0, 0 +; GFX12-NEXT: s_mov_b32 s32, 16 +; GFX12-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_ctz_i32_b32 s2, s1 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readlane_b32 s3, v0, s2 +; GFX12-NEXT: s_bitset0_b32 s1, s2 +; GFX12-NEXT: s_max_u32 s0, s0, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_cmp_lg_u32 s1, 0 +; GFX12-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX12-NEXT: ; %bb.2: +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT: s_mov_b32 s4, s32 +; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT: v_lshl_add_u32 v0, s0, 5, s4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_readfirstlane_b32 s32, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: 
scratch_store_b32 off, v40, s4 +; GFX12-NEXT: s_endpgm +; +; GFX942-LABEL: test_call_and_alloca_var: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT: v_and_b32_e32 v0, -16, v0 +; GFX942-NEXT: v_mov_b32_e32 v40, 0 +; GFX942-NEXT: s_mov_b64 s[0:1], exec +; GFX942-NEXT: s_mov_b32 s2, 0 +; GFX942-NEXT: s_mov_b32 s32, 16 +; GFX942-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT: s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT: v_readlane_b32 s4, v0, s3 +; GFX942-NEXT: s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT: s_max_u32 s2, s2, s4 +; GFX942-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT: s_cbranch_scc1 .LBB8_1 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT: s_mov_b32 s4, s32 +; GFX942-NEXT: v_mov_b32_e32 v0, s4 +; GFX942-NEXT: v_lshl_add_u32 v0, s2, 6, v0 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_readfirstlane_b32 s32, v0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT: scratch_store_dword off, v40, s4 +; GFX942-NEXT: s_endpgm + %v = alloca i32, i32 %count, align 4, addrspace(5) + call amdgpu_gfx void @foo() + store i32 0, ptr addrspace(5) %v, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll index f6ae516faf2f7..89d039406f375 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll @@ -1489,7 +1489,7 @@ attributes #2 = { noinline } !0 = !{float 3.0} ;. 
; CHECK: attributes #[[ATTR0]] = { strictfp } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind memory(read) } ; CHECK: attributes #[[ATTR3]] = { noinline } ; CHECK: attributes #[[ATTR4]] = { nobuiltin } diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll new file mode 100644 index 0000000000000..253a6ec100eae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/callbr.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s + +define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) { +; CHECK-LABEL: callbr_inline_asm: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_load_dword v0, v[0:1] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2 +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %fallthrough +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[2:3], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: .LBB0_2: ; Inline asm indirect target +; CHECK-NEXT: ; %indirect +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dword v[4:5], v0 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + %a = load i32, ptr %src, align 4 + callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect] +fallthrough: + store i32 %a, ptr %dst1, align 4 + br label %ret +indirect: + store i32 %a, ptr %dst2, align 4 + br label %ret +ret: + ret void +} + +define void @callbr_self_loop(i1 %c) { +; CHECK-LABEL: callbr_self_loop: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: .LBB1_1: ; %callbr +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_1 +; CHECK-NEXT: .LBB1_2: ; Inline asm indirect target +; CHECK-NEXT: ; %callbr.target.ret +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_setpc_b64 s[30:31] + br label %callbr +callbr: + callbr void asm "", "!i"() to label %callbr [label %ret] +ret: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 0fc54aeaef77b..26f77898faf60 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -2407,51 +2407,52 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0 ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0 -; GISEL-NEXT: v_mul_lo_u32 v24, v30, v19 -; GISEL-NEXT: v_mul_lo_u32 v25, v29, v18 +; GISEL-NEXT: v_mul_lo_u32 v27, v30, v19 +; GISEL-NEXT: v_mul_lo_u32 v36, v29, v18 ; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v20, 0 ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v2, 0 -; GISEL-NEXT: v_mul_lo_u32 v26, v35, v3 -; GISEL-NEXT: v_mul_lo_u32 v27, v34, v2 +; GISEL-NEXT: v_mul_lo_u32 v37, v35, v3 +; GISEL-NEXT: v_mul_lo_u32 v38, v34, v2 ; GISEL-NEXT: v_mad_u64_u32 v[2:3], 
s[4:5], v29, v32, v[14:15] ; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[22:23] ; GISEL-NEXT: v_mov_b32_e32 v22, v19 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[2:3] -; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v20, v[14:15] -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v30, v32, v[1:2] -; GISEL-NEXT: v_mov_b32_e32 v23, v14 -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v21, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v29, v31, v[1:2] -; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v24, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v34, v20, v[22:23] -; GISEL-NEXT: v_addc_u32_e64 v14, s[6:7], v15, v26, s[6:7] -; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v25, vcc +; GISEL-NEXT: v_mad_u64_u32 v[23:24], s[4:5], v10, v31, v[2:3] +; GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v4, v20, v[14:15] +; GISEL-NEXT: v_mov_b32_e32 v2, v23 +; GISEL-NEXT: v_mad_u64_u32 v[14:15], vcc, v30, v32, v[1:2] +; GISEL-NEXT: v_mov_b32_e32 v23, v25 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v35, v21, v[22:23] +; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v29, v31, v[14:15] +; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v24, v27, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[6:7], v34, v20, v[1:2] +; GISEL-NEXT: v_addc_u32_e64 v2, s[6:7], v26, v37, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v3, v36, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v17, v1, vcc -; GISEL-NEXT: v_xor_b32_e32 v15, v0, v28 -; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v14, v27, s[4:5] -; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v18 -; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v22, s[4:5] -; GISEL-NEXT: v_xor_b32_e32 v16, v12, v33 -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v10, v32, v[3:4] -; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v4, v21, v[0:1] -; GISEL-NEXT: v_xor_b32_e32 v14, v14, v33 -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v31, v[12:13] -; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v15, v28 +; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v17, v22, vcc +; GISEL-NEXT: v_xor_b32_e32 v19, v0, v28 +; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v2, v38, s[4:5] +; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v12, v18 +; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v14, s[4:5] +; GISEL-NEXT: v_xor_b32_e32 v18, v2, v33 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v32, v[1:2] +; GISEL-NEXT: v_xor_b32_e32 v1, v16, v28 +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v4, v21, v[0:1] +; GISEL-NEXT: v_xor_b32_e32 v10, v14, v33 +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v11, v31, v[2:3] +; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v19, v28 ; GISEL-NEXT: v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v5, v20, v[3:4] -; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v16, v33 -; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v14, v33, s[8:9] -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc -; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v10, vcc -; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28 -; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v23, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v5, v20, v[12:13] +; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v18, v33 +; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v10, v33, s[8:9] +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v8, v23, vcc +; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v16, vcc +; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28 +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v15, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v7, v2, vcc ; 
GISEL-NEXT: v_xor_b32_e32 v6, v6, v33 ; GISEL-NEXT: v_xor_b32_e32 v7, v8, v28 -; GISEL-NEXT: v_xor_b32_e32 v8, v3, v33 -; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v2, v28, s[6:7] +; GISEL-NEXT: v_xor_b32_e32 v8, v2, v33 +; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v3, v28, s[6:7] ; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v28, vcc ; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v33, s[8:9] ; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v8, v33, vcc @@ -3216,36 +3217,38 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0 ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0 -; GISEL-NEXT: v_mul_lo_u32 v28, v8, v21 -; GISEL-NEXT: v_mul_lo_u32 v29, v9, v20 +; GISEL-NEXT: v_mul_lo_u32 v34, v8, v21 +; GISEL-NEXT: v_mul_lo_u32 v35, v9, v20 ; GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v24, 0 ; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v12, v18, 0 -; GISEL-NEXT: v_mul_lo_u32 v30, v12, v19 -; GISEL-NEXT: v_mul_lo_u32 v31, v13, v18 +; GISEL-NEXT: v_mul_lo_u32 v36, v12, v19 +; GISEL-NEXT: v_mul_lo_u32 v37, v13, v18 ; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v33, v[22:23] ; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v25, v[26:27] -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[18:19] -; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[22:23] -; GISEL-NEXT: v_mad_u64_u32 v[17:18], vcc, v8, v33, v[17:18] -; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v12, v25, v[21:22] -; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v9, v32, v[17:18] -; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], v19, v28, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v13, v24, v[21:22] -; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], v23, v30, s[6:7] -; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v29, vcc +; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v10, v32, v[18:19] +; GISEL-NEXT: v_mad_u64_u32 v[28:29], s[4:5], v14, v24, v[22:23] +; GISEL-NEXT: v_mov_b32_e32 v18, v26 +; GISEL-NEXT: v_mad_u64_u32 v[30:31], vcc, v8, v33, v[17:18] +; GISEL-NEXT: v_mov_b32_e32 v22, v28 +; GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v12, v25, v[21:22] +; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v9, v32, v[30:31] +; GISEL-NEXT: v_addc_u32_e64 v12, s[6:7], v27, v34, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v13, v24, v[17:18] +; GISEL-NEXT: v_addc_u32_e64 v13, s[6:7], v29, v36, s[6:7] +; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v35, vcc ; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 -; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc -; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v18, v31, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v21, vcc +; GISEL-NEXT: v_addc_u32_e64 v13, s[4:5], v13, v37, s[4:5] ; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v20 -; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v12, s[4:5] -; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[17:18] -; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v32, v[16:17] -; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v15, v24, v[18:19] -; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v9, vcc -; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc -; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v13, s[4:5] -; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc +; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v8, s[4:5] +; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[12:13] +; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[13:14] +; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v11, v32, 
v[16:17] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v15, v24, v[18:19] +; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v22, vcc +; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v12, vcc +; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v9, s[4:5] +; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v10, vcc ; GISEL-NEXT: s_setpc_b64 s[30:31] %shl = urem <2 x i128> %lhs, %rhs ret <2 x i128> %shl diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll index 007e3f0a6bdbc..076a99ff8588f 100644 --- a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll +++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll @@ -3,6 +3,7 @@ declare void @foo(ptr) declare i1 @bar(ptr) +declare i32 @bar32(ptr) define void @musttail_call_without_return_value(ptr %p) { ; CHECK-LABEL: define void @musttail_call_without_return_value( @@ -28,6 +29,31 @@ bb.1: ret void } +define void @musttail_call_without_return_value_callbr(ptr %p) { +; CHECK-LABEL: define void @musttail_call_without_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: musttail call void @foo(ptr [[P]]) +; CHECK-NEXT: ret void +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret void +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + musttail call void @foo(ptr %p) + ret void + +bb.1: + ret void +} + define i1 @musttail_call_with_return_value(ptr %p) { ; CHECK-LABEL: define i1 @musttail_call_with_return_value( ; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { @@ -51,3 +77,28 @@ bb.0: bb.1: ret i1 %load } + +define i32 @musttail_call_with_return_value_callbr(ptr %p) { +; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT: callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT: to label %[[BB_0:.*]] [label %bb.1] +; CHECK: [[BB_0]]: +; CHECK-NEXT: [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]]) +; CHECK-NEXT: ret i32 [[RET]] +; CHECK: [[BB_1:.*:]] +; CHECK-NEXT: ret i32 [[LOAD]] +; +entry: + %load = load i32, ptr %p, align 1 + callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: + %ret = musttail call i32 @bar32(ptr %p) + ret i32 %ret + +bb.1: + ret i32 %load +} diff --git a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir index 0548bcf304c32..279f4298e6418 100644 --- a/llvm/test/CodeGen/AMDGPU/finalizebundle.mir +++ b/llvm/test/CodeGen/AMDGPU/finalizebundle.mir @@ -34,3 +34,16 @@ body: | $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec dead $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec ... + +--- +name: test_tied +body: | + bb.0: + ; CHECK-LABEL: name: test_tied + ; CHECK: BUNDLE implicit-def %0, implicit-def %2, implicit %1:vgpr_32(tied-def 1), implicit $mode, implicit $exec { + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY %1:vgpr_32 + ; CHECK-NEXT: [[V_FMAC_F16_e32_:%[0-9]+]]:vgpr_32 = V_FMAC_F16_e32 internal [[COPY]], internal [[COPY]], %1:vgpr_32, implicit $mode, implicit $exec + ; CHECK-NEXT: } + %1:vgpr_32 = COPY %0:vgpr_32 + %2:vgpr_32 = V_FMAC_F16_e32 %1, %1, %0, implicit $mode, implicit $exec +... 
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index e0421575c3174..460f1211d1386 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -237,31 +237,31 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 @@ -275,17 +275,18 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3] -; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], 
s[6:7], v6, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3] +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 +; GISEL-NEXT: v_mov_b32_e32 v2, v4 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB0_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: .LBB0_7: ; %Flow2 @@ -604,31 +605,31 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffb8d, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr9 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr8 @@ -642,17 +643,18 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v9, 0 -; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], 
v5, v9, v[2:3] -; GISEL-NEXT: v_mul_lo_u32 v6, v5, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v4, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v6, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v7, v1, v5, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v8, 0 +; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v7, v9, v[2:3] +; GISEL-NEXT: v_mul_lo_u32 v10, v7, v9 +; GISEL-NEXT: v_mov_b32_e32 v2, v4 +; GISEL-NEXT: v_mad_u64_u32 v[3:4], vcc, v6, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v6, v6, v9 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v7, v8, v[3:4] +; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v5, v6, s[6:7] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v10, vcc ; GISEL-NEXT: .LBB1_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] ; GISEL-NEXT: .LBB1_7: ; %Flow2 @@ -962,31 +964,31 @@ define i128 @fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 @@ -999,12 +1001,14 @@ define i128 
@fptosi_f32_to_i128(float %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 +; GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: .LBB2_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB2_7: ; %Flow2 @@ -1314,31 +1318,31 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_add_u32_e32 v7, 0xffffff6a, v6 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v10, v8, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 ; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff2a, v6 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v7 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5] ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7 -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v10, v9, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v10, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v10, v8, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v11, v8 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v9, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v8 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v8 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v11, v9, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v9, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v8, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v9, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6 ; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GISEL-NEXT: ; implicit-def: $vgpr9 @@ -1351,12 +1355,14 @@ define i128 @fptoui_f32_to_i128(float %x) { ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; GISEL-NEXT: 
v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0 -; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v8, 0 -; GISEL-NEXT: v_mul_lo_u32 v5, v4, v8 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v6, v0, v4, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v6, v9, 0 +; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v6, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GISEL-NEXT: v_mul_lo_u32 v7, v6, v8 +; GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v6, v8, v[4:5] +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc ; GISEL-NEXT: .LBB3_6: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: .LBB3_7: ; %Flow2 @@ -1702,31 +1708,31 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB6_4: ; %Flow @@ -2050,31 +2056,31 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; GISEL-NEXT: v_add_u32_e32 v10, 0xffffff7a, v5 ; GISEL-NEXT: v_lshlrev_b64 v[0:1], v10, v[6:7] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v10 -; 
GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v9, 0 +; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, 0, v1, vcc +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v9, 0 ; GISEL-NEXT: v_add_u32_e32 v4, 0xffffff3a, v5 ; GISEL-NEXT: v_sub_u32_e32 v2, 64, v10 ; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[6:7] ; GISEL-NEXT: v_lshlrev_b64 v[4:5], v4, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v10 -; GISEL-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v8, v[6:7] -; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0 -; GISEL-NEXT: v_mov_b32_e32 v2, v6 -; GISEL-NEXT: v_mul_lo_u32 v6, v11, v9 -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2] -; GISEL-NEXT: v_mul_lo_u32 v4, v12, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v2, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[8:9], v14, v8, v[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v12, v8, 0 +; GISEL-NEXT: v_mov_b32_e32 v2, v10 +; GISEL-NEXT: v_mul_lo_u32 v10, v12, v9 +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[1:2] +; GISEL-NEXT: v_mul_lo_u32 v4, v13, v9 ; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2] -; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11] +; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v13, v8, v[6:7] +; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v11, v10, s[10:11] ; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9] -; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v10, v9, v[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v14, v9, v[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v5, v3, 0, s[6:7] +; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr5 -; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr6_vgpr7 ; GISEL-NEXT: ; implicit-def: $vgpr8 ; GISEL-NEXT: .LBB7_4: ; %Flow diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index 3e2e43faca5aa..df635925b87df 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -36,26 +36,60 @@ loop: br label %loop } +define amdgpu_kernel void @infinite_loop_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP:%.*]] [] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", ""() to label %loop [] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 
4 + callbr void asm "", ""() to label %loop [] +} + define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB1_3 +; SI-NEXT: s_cbranch_execz .LBB2_3 ; SI-NEXT: ; %bb.1: ; %loop.preheader ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB1_2: ; %loop +; SI-NEXT: .LBB2_2: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB1_2 -; SI-NEXT: .LBB1_3: ; %UnifiedReturnBlock +; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: .LBB2_3: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_ret( ; IR-NEXT: entry: @@ -81,44 +115,93 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_ret_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop.preheader +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: .LBB3_2: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP]], 1 +; IR-NEXT: [[COND32:%.*]] = zext i1 [[COND]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND32]]) +; IR-NEXT: to label [[LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP]] [] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %cond = icmp eq i32 %tmp, 1 + %cond32 = zext i1 %cond to i32 + callbr void asm "", "r,!i"(i32 %cond32) to label %loop [label %return] + +loop: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop [] + +return: + ret void +} + define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loops: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; SI-NEXT: s_mov_b64 s[2:3], -1 -; SI-NEXT: s_cbranch_scc1 .LBB2_4 +; SI-NEXT: s_cbranch_scc1 .LBB4_4 ; SI-NEXT: ; %bb.1: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x378 ; SI-NEXT: s_and_b64 vcc, exec, -1 -; SI-NEXT: .LBB2_2: ; %loop2 +; SI-NEXT: .LBB4_2: ; %loop2 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccnz .LBB2_2 +; SI-NEXT: s_cbranch_vccnz .LBB4_2 ; SI-NEXT: ; %bb.3: ; %Flow ; 
SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB2_4: ; %Flow2 +; SI-NEXT: .LBB4_4: ; %Flow2 ; SI-NEXT: s_and_b64 vcc, exec, s[2:3] ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_7 +; SI-NEXT: s_cbranch_vccz .LBB4_7 ; SI-NEXT: ; %bb.5: ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt expcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 ; SI-NEXT: s_and_b64 vcc, exec, 0 -; SI-NEXT: .LBB2_6: ; %loop1 +; SI-NEXT: .LBB4_6: ; %loop1 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_mov_b64 vcc, vcc -; SI-NEXT: s_cbranch_vccz .LBB2_6 -; SI-NEXT: .LBB2_7: ; %DummyReturnBlock +; SI-NEXT: s_cbranch_vccz .LBB4_6 +; SI-NEXT: .LBB4_7: ; %DummyReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loops( ; IR-NEXT: entry: @@ -144,24 +227,78 @@ loop2: br label %loop2 } +define amdgpu_kernel void @infinite_loops_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loops_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %loop1 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB5_2: ; Inline asm indirect target +; SI-NEXT: ; %loop2.preheader +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x378 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loops_callbr( +; IR-NEXT: entry: +; IR-NEXT: callbr void asm "", "r,!i"(i32 poison) +; IR-NEXT: to label [[LOOP1:%.*]] [label %loop2] +; IR: loop1: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP1]] [] +; IR: loop2: +; IR-NEXT: store volatile i32 888, ptr addrspace(1) [[OUT]], align 4 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK1:%.*]], label [[DUMMYRETURNBLOCK]] +; IR: TransitionBlock1: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[LOOP2:%.*]] [] +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +entry: + callbr void asm "", "r,!i"(i32 poison) to label %loop1 [label %loop2] + +loop1: + store volatile i32 999, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop1 [] + +loop2: + store volatile i32 888, ptr addrspace(1) %out, align 4 + callbr void asm "", ""() to label %loop2 [] +} + define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-LABEL: infinite_loop_nest_ret: ; SI: ; %bb.0: ; %entry ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT: s_cbranch_execz .LBB3_5 +; SI-NEXT: s_cbranch_execz .LBB6_5 ; SI-NEXT: ; %bb.1: ; %outer_loop.preheader ; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 ; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 3, v0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT: .LBB3_2: ; %outer_loop +; SI-NEXT: .LBB6_2: ; %outer_loop ; SI-NEXT: ; =>This Loop Header: Depth=1 -; SI-NEXT: ; Child Loop BB3_3 Depth 2 +; SI-NEXT: ; Child Loop BB6_3 Depth 2 ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: .LBB3_3: ; 
%inner_loop -; SI-NEXT: ; Parent Loop BB3_2 Depth=1 +; SI-NEXT: .LBB6_3: ; %inner_loop +; SI-NEXT: ; Parent Loop BB6_2 Depth=1 ; SI-NEXT: ; => This Inner Loop Header: Depth=2 ; SI-NEXT: s_and_b64 s[8:9], exec, s[0:1] ; SI-NEXT: s_or_b64 s[2:3], s[8:9], s[2:3] @@ -169,13 +306,13 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) { ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT: s_cbranch_execnz .LBB3_3 +; SI-NEXT: s_cbranch_execnz .LBB6_3 ; SI-NEXT: ; %bb.4: ; %loop.exit.guard -; SI-NEXT: ; in Loop: Header=BB3_2 Depth=1 +; SI-NEXT: ; in Loop: Header=BB6_2 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 vcc, 0 -; SI-NEXT: s_branch .LBB3_2 -; SI-NEXT: .LBB3_5: ; %UnifiedReturnBlock +; SI-NEXT: s_branch .LBB6_2 +; SI-NEXT: .LBB6_5: ; %UnifiedReturnBlock ; SI-NEXT: s_endpgm ; IR-LABEL: @infinite_loop_nest_ret( ; IR-NEXT: entry: @@ -212,4 +349,82 @@ return: ret void } +define amdgpu_kernel void @infinite_loop_nest_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_nest_ret_callbr: +; SI: ; %bb.0: ; %entry +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: ; %bb.1: ; %outer_loop.preheader +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT: s_and_b64 s[0:1], exec, 0 +; SI-NEXT: s_branch .LBB7_3 +; SI-NEXT: .LBB7_2: ; %loop.exit.guard +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_and_b64 vcc, exec, s[2:3] +; SI-NEXT: s_cbranch_vccnz .LBB7_5 +; SI-NEXT: .LBB7_3: ; %outer_loop +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: ;;#ASMSTART +; SI-NEXT: ;;#ASMEND +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b64 s[2:3], -1 +; SI-NEXT: s_mov_b64 vcc, s[0:1] +; SI-NEXT: s_cbranch_vccz .LBB7_2 +; SI-NEXT: ; %bb.4: ; %TransitionBlock.target.outer_loop +; SI-NEXT: ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_branch .LBB7_2 +; SI-NEXT: .LBB7_5: ; Inline asm indirect target +; SI-NEXT: ; %UnifiedReturnBlock +; SI-NEXT: ; Label of block must be emitted +; SI-NEXT: s_endpgm +; IR-LABEL: @infinite_loop_nest_ret_callbr( +; IR-NEXT: entry: +; IR-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT: [[COND1:%.*]] = icmp ne i32 [[TMP]], 1 +; IR-NEXT: [[COND1_32:%.*]] = zext i1 [[COND1]] to i32 +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND1_32]]) +; IR-NEXT: to label [[OUTER_LOOP:%.*]] [label %UnifiedReturnBlock] +; IR: outer_loop: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[INNER_LOOP:%.*]] [] +; IR: inner_loop: +; IR-NEXT: store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT: [[COND3:%.*]] = icmp eq i32 [[TMP]], 3 +; IR-NEXT: [[COND3_32:%.*]] = zext i1 [[COND3]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[COND3_32]]) +; IR-NEXT: to label [[INNER_LOOP]] [label %outer_loop] +; IR: UnifiedReturnBlock: +; IR-NEXT: ret void +; +entry: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %cond1 = icmp ne i32 %tmp, 1 ; avoid following BB optimizing away through the domination + %cond1_32 = zext i1 %cond1 to i32 + callbr void asm "", "r,!i"(i32 %cond1_32) to label %outer_loop 
[label %return] + +outer_loop: + ; %cond2 = icmp eq i32 %tmp, 2 + ; br i1 %cond2, label %outer_loop, label %inner_loop + callbr void asm "", ""() to label %inner_loop [] + +inner_loop: ; preds = %LeafBlock, %LeafBlock1 + store volatile i32 999, ptr addrspace(1) %out, align 4 + %cond3 = icmp eq i32 %tmp, 3 + %cond3_32 = zext i1 %cond3 to i32 + callbr void asm "", "r,!i"(i32 %cond3_32) to label %inner_loop [label %outer_loop] + +return: + ret void +} + declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 31b6b533866d4..f705a2ffc4f1d 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -5775,28 +5775,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX7-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, v0, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX7-GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v8 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX7-GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX7-GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX8-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5831,28 +5831,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX8-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, v0, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, v4, v7, vcc -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX8-GISEL-NEXT: v_add_u32_e32 v6, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v4, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v5 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v2, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, v0, v8 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, v6, v9, vcc +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX8-GISEL-NEXT: v_add_u32_e32 v7, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX8-GISEL-NEXT: v_add_u32_e32 v8, vcc, 1, v4 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5883,28 +5883,28 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX900-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX900-GISEL: ; %bb.0: ; %entry ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, v[1:2] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v2, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, v4, v7, 
vcc -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v2, v[6:7] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v6, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v5 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v4 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v7, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v2, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v6, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v4, v7, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v8 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, v6, v9, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v3, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v2, v[7:8] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v6, vcc +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v3 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v1, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, 0, v9, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v7, v[5:6] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v10, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v8, v[4:5] ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX90A-SDAG-LABEL: clpeak_imad_pat_i64: @@ -5935,29 +5935,29 @@ define i64 @clpeak_imad_pat_i64(i64 %x, i64 %y) { ; GFX90A-GISEL-LABEL: clpeak_imad_pat_i64: ; GFX90A-GISEL: ; %bb.0: ; %entry ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v3, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v2, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v2, v[4:5] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v4 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, v0, v6 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, v1, v7, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v2, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v6, v3, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, v[6:7] -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v2 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v5, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0 -; 
GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v6, v[0:1] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v4, v3, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v7, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v8, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, v[2:3] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v3, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v2, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v2, v[4:5] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, v0, v8 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v10, vcc, v1, v9, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v8, v3, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v8, v2, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v2, v[6:7] +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v7, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v5, v8 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v8, vcc, 1, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v9, vcc, 0, v6, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v7, v[0:1] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v6, v3, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v9, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v8, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v8, v[4:5] ; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6408,52 +6408,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX7-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v2 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX7-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, 
v[8:9] +; GFX7-GISEL-NEXT: v_add_i32_e32 v3, vcc, v0, v14 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] +; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, v2, v16 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v10 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX7-GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] +; GFX7-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] +; GFX7-GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc ; GFX7-GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v2 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v10 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX7-GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX7-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX7-GISEL-NEXT: v_add_i32_e32 v14, vcc, 1, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GFX7-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc +; GFX7-GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v13 +; GFX7-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, 
v[11:12] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6513,52 +6513,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX8-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v2 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX8-GISEL-NEXT: v_addc_u32_e32 v16, vcc, v8, v13, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] +; GFX8-GISEL-NEXT: v_add_u32_e32 v3, vcc, v0, v14 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v12, v15, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, v2, v16 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v10 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, v9, v15, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX8-GISEL-NEXT: v_add_u32_e32 v11, vcc, 1, v0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v8, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v19, vcc, v8, v17, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] +; GFX8-GISEL-NEXT: v_add_u32_e32 v10, vcc, 1, v0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v12, vcc ; GFX8-GISEL-NEXT: v_add_u32_e32 v12, vcc, 1, v2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v8, vcc +; GFX8-GISEL-NEXT: 
v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v2, vcc, 0, v9, vcc -; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v10 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v3, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX8-GISEL-NEXT: v_add_u32_e32 v16, vcc, 1, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX8-GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v4, vcc -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX8-GISEL-NEXT: v_add_u32_e32 v14, vcc, 1, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GFX8-GISEL-NEXT: v_addc_u32_e32 v15, vcc, 0, v16, vcc +; GFX8-GISEL-NEXT: v_add_u32_e32 v17, vcc, 1, v13 +; GFX8-GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v3, vcc +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6610,52 +6610,52 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX900-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX900-GISEL: ; %bb.0: ; %entry ; GFX900-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v1, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v5, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v1, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v14, v4, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v3, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v16, v6, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v5, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[8:9] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v12 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v15, v6, v[10:11] -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v8, v13, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v4, 0 -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, v2, v14 
-; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v6, 0 -; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v3, v5, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v7, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v4, v[8:9] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, v0, v14 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v17, v6, v[10:11] +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v12, v15, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v3, v4, 0 +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, v2, v16 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v16, v6, 0 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v10 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v5, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v14 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, v9, v15, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v17, v7, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v16, v4, v[11:12] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v18, v6, v[14:15] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v10, v11, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v8, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v16, v7, v[1:2] +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v19, vcc, v8, v17, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v18, v4, v[10:11] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v19, v6, v[14:15] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v10, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v12, vcc ; GFX900-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 +; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v5 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v6, v[0:1] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v13, v12, 0 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v8, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v16, v10, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v6 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v1, v[0:1] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v12, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v9, vcc -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v10 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v3, v11, v[0:1] -; GFX900-GISEL-NEXT: v_mov_b32_e32 v0, v7 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v13, v2, v[0:1] -; GFX900-GISEL-NEXT: v_add_co_u32_e32 v16, vcc, 1, v13 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v14, 0 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v12, v[2:3] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v16, 0 -; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v4, vcc -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v15, v[1:2] +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v9 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v11, v[0:1] +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v16, vcc +; GFX900-GISEL-NEXT: v_add_co_u32_e32 v17, vcc, 1, v13 +; GFX900-GISEL-NEXT: v_addc_co_u32_e32 v18, vcc, 0, v3, vcc +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v14, 0 +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v3, v12, v[8:9] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v17, 0 +; GFX900-GISEL-NEXT: 
v_mad_u64_u32 v[8:9], s[4:5], v4, v15, v[1:2] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v17, v[1:2] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v14, v[4:5] -; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v16, v[10:11] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v5, v18, v[1:2] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v14, v[8:9] +; GFX900-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v17, v[11:12] ; GFX900-GISEL-NEXT: v_mov_b32_e32 v1, v5 ; GFX900-GISEL-NEXT: s_setpc_b64 s[30:31] ; @@ -6707,54 +6707,54 @@ define <2 x i64> @clpeak_imad_pat_v2i64(<2 x i64> %x, <2 x i64> %y) { ; GFX90A-GISEL-LABEL: clpeak_imad_pat_v2i64: ; GFX90A-GISEL: ; %bb.0: ; %entry ; GFX90A-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v2 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v3, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v5, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v4, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v4, v[2:3] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v7, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, v0, v10 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v6, v[8:9] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, v1, v11, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v8 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v2, v12 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v4, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v10, v5, 0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, v3, v13, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v4, v[10:11] -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v7, 0 -; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v13, v6, v[10:11] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v5, v5, v6 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v6, vcc, 1, v0 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v1, vcc -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v10, vcc, 1, v2 -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v3, vcc -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v0, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v8 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v6, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v6, v[0:1] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v13, vcc, 0, v9, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v11, 0 -; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v4 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v10, v[0:1] -; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v5, vcc -; GFX90A-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v12, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v13, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v12, v[2:3] -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v15, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, 1, v0 +; GFX90A-GISEL-NEXT: 
v_addc_co_u32_e32 v13, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, 1, v2 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v3, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v5, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v4, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v4, v[2:3] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v8 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v14, v7, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v12, vcc, v0, v12 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v6, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v15, v6, v[8:9] +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, v1, v13, vcc +; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v10 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v14, vcc, v2, v14 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v12, v5, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v15, vcc, v3, v15, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v4, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v4, v[10:11] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v7, 0 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v9, v12 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v6, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v15, v6, v[10:11] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v10, v5, v12 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v5, vcc, 1, v0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v11, vcc, 1, v2 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v12, vcc, 0, v3, vcc +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v13, vcc, 1, v8 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0 +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v14, vcc, 0, v9, vcc +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v5, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v5, v[0:1] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v12, 0 +; GFX90A-GISEL-NEXT: v_add_co_u32_e32 v15, vcc, 1, v4 +; GFX90A-GISEL-NEXT: v_add_u32_e32 v8, v3, v6 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v11, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v11, v[0:1] +; GFX90A-GISEL-NEXT: v_addc_co_u32_e32 v16, vcc, 0, v10, vcc +; GFX90A-GISEL-NEXT: v_add_u32_e32 v9, v7, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v14, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v13, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v13, v[4:5] +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v16, 0 ; GFX90A-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v14, 0 -; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v7, v14, v[4:5] -; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v15, 0 +; GFX90A-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v9, v15, v[4:5] +; GFX90A-GISEL-NEXT: v_add_u32_e32 v3, v3, v6 ; GFX90A-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: clpeak_imad_pat_v2i64: diff --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir index fa52b96e9ea95..02eda2c4822c2 100644 --- a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir +++ b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir @@ -6,40 +6,12 @@ # No more registers shall be defined --- name: main -alignment: 1 -exposesReturnsTwice: false -legalized: false -regBankSelected: false -selected: false tracksRegLiveness: true registers: - - { id: 1, class: sreg_32_xm0, 
preferred-register: '%1' } - - { id: 2, class: vreg_64, preferred-register: '%2' } - - { id: 3, class: vreg_64 } - - { id: 4, class: vreg_64 } - - { id: 5, class: vreg_64 } - - { id: 6, class: vreg_96 } - - { id: 7, class: vreg_96 } - - { id: 8, class: vreg_128 } - - { id: 9, class: vreg_128 } -liveins: - - { reg: '$sgpr6', virtual-reg: '%1' } -frameInfo: - isFrameAddressTaken: false - isReturnAddressTaken: false - hasStackMap: false - hasPatchPoint: false - stackSize: 0 - offsetAdjustment: 0 - maxAlignment: 0 - adjustsStack: false - hasCalls: false - maxCallFrameSize: 0 - hasOpaqueSPAdjustment: false - hasVAStart: false - hasMustTailInVarArgFunc: false + - { id: 0, class: sreg_32_xm0, preferred-register: '%0' } + - { id: 1, class: vreg_64, preferred-register: '%1' } body: | - bb.0.entry: + bb.0: liveins: $sgpr0, $vgpr0_vgpr1 ; CHECK-LABEL: name: main @@ -59,20 +31,21 @@ body: | ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF2]] ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub3:vreg_128 = COPY [[DEF]].sub0 ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr - %3 = IMPLICIT_DEF - undef %4.sub0 = COPY $sgpr0 - %4.sub1 = COPY %3.sub0 - undef %5.sub0 = COPY %4.sub1 - %5.sub1 = COPY %4.sub0 - FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %5, 0, 0, implicit $exec, implicit $flat_scr + %2:vreg_64 = IMPLICIT_DEF + undef %3.sub0:vreg_64 = COPY $sgpr0 + %3.sub1:vreg_64 = COPY %2.sub0 + undef %4.sub0:vreg_64 = COPY %3.sub1 + %4.sub1:vreg_64 = COPY %3.sub0 + FLAT_STORE_DWORDX2 $vgpr0_vgpr1, killed %4, 0, 0, implicit $exec, implicit $flat_scr - %6 = IMPLICIT_DEF - undef %7.sub0_sub1 = COPY %6 - %7.sub2 = COPY %3.sub0 - FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %7, 0, 0, implicit $exec, implicit $flat_scr + %5:vreg_96 = IMPLICIT_DEF + undef %6.sub0_sub1:vreg_96 = COPY %5 + %6.sub2:vreg_96 = COPY %2.sub0 + FLAT_STORE_DWORDX3 $vgpr0_vgpr1, killed %6, 0, 0, implicit $exec, implicit $flat_scr + + %7:vreg_128 = IMPLICIT_DEF + undef %8.sub0_sub1_sub2:vreg_128 = COPY %7 + %8.sub3:vreg_128 = COPY %2.sub0 + FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %8, 0, 0, implicit $exec, implicit $flat_scr - %8 = IMPLICIT_DEF - undef %9.sub0_sub1_sub2 = COPY %8 - %9.sub3 = COPY %3.sub0 - FLAT_STORE_DWORDX4 $vgpr0_vgpr1, killed %9, 0, 0, implicit $exec, implicit $flat_scr ... 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll index 1ab4cb0f00192..d82d6bcb437cc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w32.ll @@ -781,16 +781,23 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr ; GISEL12-NEXT: v_dual_mov_b32 v14, v38 :: v_dual_mov_b32 v15, v39 ; GISEL12-NEXT: s_wait_kmcnt 0x0 ; GISEL12-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GISEL12-NEXT: v_dual_mov_b32 v24, v0 :: v_dual_mov_b32 v25, v1 -; GISEL12-NEXT: v_dual_mov_b32 v26, v2 :: v_dual_mov_b32 v27, v3 -; GISEL12-NEXT: v_dual_mov_b32 v28, v4 :: v_dual_mov_b32 v29, v5 -; GISEL12-NEXT: v_dual_mov_b32 v30, v6 :: v_dual_mov_b32 v31, v7 -; GISEL12-NEXT: v_dual_mov_b32 v32, v8 :: v_dual_mov_b32 v33, v9 -; GISEL12-NEXT: v_dual_mov_b32 v34, v10 :: v_dual_mov_b32 v35, v11 -; GISEL12-NEXT: v_dual_mov_b32 v36, v12 :: v_dual_mov_b32 v37, v13 -; GISEL12-NEXT: v_dual_mov_b32 v38, v14 :: v_dual_mov_b32 v39, v15 +; GISEL12-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1 +; GISEL12-NEXT: v_dual_mov_b32 v42, v2 :: v_dual_mov_b32 v43, v3 +; GISEL12-NEXT: v_dual_mov_b32 v44, v4 :: v_dual_mov_b32 v45, v5 +; GISEL12-NEXT: v_dual_mov_b32 v46, v6 :: v_dual_mov_b32 v47, v7 +; GISEL12-NEXT: v_dual_mov_b32 v48, v8 :: v_dual_mov_b32 v49, v9 +; GISEL12-NEXT: v_dual_mov_b32 v50, v10 :: v_dual_mov_b32 v51, v11 +; GISEL12-NEXT: v_dual_mov_b32 v52, v12 :: v_dual_mov_b32 v53, v13 +; GISEL12-NEXT: v_dual_mov_b32 v54, v14 :: v_dual_mov_b32 v55, v15 ; GISEL12-NEXT: s_mov_b32 exec_lo, s9 -; GISEL12-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec +; GISEL12-NEXT: v_dual_mov_b32 v24, v40 :: v_dual_mov_b32 v25, v41 +; GISEL12-NEXT: v_dual_mov_b32 v26, v42 :: v_dual_mov_b32 v27, v43 +; GISEL12-NEXT: v_dual_mov_b32 v28, v44 :: v_dual_mov_b32 v29, v45 +; GISEL12-NEXT: v_dual_mov_b32 v30, v46 :: v_dual_mov_b32 v31, v47 +; GISEL12-NEXT: v_dual_mov_b32 v32, v48 :: v_dual_mov_b32 v33, v49 +; GISEL12-NEXT: v_dual_mov_b32 v34, v50 :: v_dual_mov_b32 v35, v51 +; GISEL12-NEXT: v_dual_mov_b32 v36, v52 :: v_dual_mov_b32 v37, v53 +; GISEL12-NEXT: v_dual_mov_b32 v38, v54 :: v_dual_mov_b32 v39, v55 ; GISEL12-NEXT: .LBB5_2: ; %tail ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_or_b32 exec_lo, exec_lo, s4 @@ -946,24 +953,39 @@ define amdgpu_cs_chain void @wwm_write_to_arg_reg(<3 x i32> inreg %sgpr, ptr inr ; GISEL10-NEXT: s_mov_b64 s[2:3], s[50:51] ; GISEL10-NEXT: s_waitcnt lgkmcnt(0) ; GISEL10-NEXT: s_swappc_b64 s[30:31], s[12:13] -; GISEL10-NEXT: v_mov_b32_e32 v24, v0 -; GISEL10-NEXT: v_mov_b32_e32 v25, v1 -; GISEL10-NEXT: v_mov_b32_e32 v26, v2 -; GISEL10-NEXT: v_mov_b32_e32 v27, v3 -; GISEL10-NEXT: v_mov_b32_e32 v28, v4 -; GISEL10-NEXT: v_mov_b32_e32 v29, v5 -; GISEL10-NEXT: v_mov_b32_e32 v30, v6 -; GISEL10-NEXT: v_mov_b32_e32 v31, v7 -; GISEL10-NEXT: v_mov_b32_e32 v32, v8 -; GISEL10-NEXT: v_mov_b32_e32 v33, v9 -; GISEL10-NEXT: v_mov_b32_e32 v34, v10 -; GISEL10-NEXT: v_mov_b32_e32 v35, v11 -; GISEL10-NEXT: v_mov_b32_e32 v36, v12 -; GISEL10-NEXT: v_mov_b32_e32 v37, v13 -; GISEL10-NEXT: v_mov_b32_e32 v38, v14 -; GISEL10-NEXT: v_mov_b32_e32 v39, v15 +; GISEL10-NEXT: v_mov_b32_e32 v40, v0 +; GISEL10-NEXT: v_mov_b32_e32 v41, v1 +; GISEL10-NEXT: 
v_mov_b32_e32 v42, v2 +; GISEL10-NEXT: v_mov_b32_e32 v43, v3 +; GISEL10-NEXT: v_mov_b32_e32 v44, v4 +; GISEL10-NEXT: v_mov_b32_e32 v45, v5 +; GISEL10-NEXT: v_mov_b32_e32 v46, v6 +; GISEL10-NEXT: v_mov_b32_e32 v47, v7 +; GISEL10-NEXT: v_mov_b32_e32 v48, v8 +; GISEL10-NEXT: v_mov_b32_e32 v49, v9 +; GISEL10-NEXT: v_mov_b32_e32 v50, v10 +; GISEL10-NEXT: v_mov_b32_e32 v51, v11 +; GISEL10-NEXT: v_mov_b32_e32 v52, v12 +; GISEL10-NEXT: v_mov_b32_e32 v53, v13 +; GISEL10-NEXT: v_mov_b32_e32 v54, v14 +; GISEL10-NEXT: v_mov_b32_e32 v55, v15 ; GISEL10-NEXT: s_mov_b32 exec_lo, s9 -; GISEL10-NEXT: ; kill: def $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 killed $exec +; GISEL10-NEXT: v_mov_b32_e32 v24, v40 +; GISEL10-NEXT: v_mov_b32_e32 v25, v41 +; GISEL10-NEXT: v_mov_b32_e32 v26, v42 +; GISEL10-NEXT: v_mov_b32_e32 v27, v43 +; GISEL10-NEXT: v_mov_b32_e32 v28, v44 +; GISEL10-NEXT: v_mov_b32_e32 v29, v45 +; GISEL10-NEXT: v_mov_b32_e32 v30, v46 +; GISEL10-NEXT: v_mov_b32_e32 v31, v47 +; GISEL10-NEXT: v_mov_b32_e32 v32, v48 +; GISEL10-NEXT: v_mov_b32_e32 v33, v49 +; GISEL10-NEXT: v_mov_b32_e32 v34, v50 +; GISEL10-NEXT: v_mov_b32_e32 v35, v51 +; GISEL10-NEXT: v_mov_b32_e32 v36, v52 +; GISEL10-NEXT: v_mov_b32_e32 v37, v53 +; GISEL10-NEXT: v_mov_b32_e32 v38, v54 +; GISEL10-NEXT: v_mov_b32_e32 v39, v55 ; GISEL10-NEXT: .LBB5_2: ; %tail ; GISEL10-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GISEL10-NEXT: v_mov_b32_e32 v8, v24 diff --git a/llvm/test/CodeGen/AMDGPU/private-function.ll b/llvm/test/CodeGen/AMDGPU/private-function.ll new file mode 100644 index 0000000000000..8eefc9dfc5d7e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/private-function.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define private void @foo() { +; CHECK-LABEL: foo: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT: s_wait_expcnt 0x0 +; CHECK-NEXT: s_wait_samplecnt 0x0 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: s_wait_kmcnt 0x0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + ret void +} + +@var = global ptr @foo diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll deleted file mode 100644 index 05a0e39d4a715..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-to-vector.ll +++ /dev/null @@ -1,325 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 -; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=amdgpu-promote-alloca < %s | FileCheck -check-prefix=OPT %s - -define amdgpu_kernel void @large_array_vectors_small_users(<16 x i8> %in, <16 x i8> %add, ptr addrspace(3) %out) #0 { -; OPT-LABEL: define amdgpu_kernel void @large_array_vectors_small_users( -; OPT-SAME: <16 x i8> [[IN:%.*]], <16 x i8> [[ADD:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { -; OPT-NEXT: [[ENTRY:.*:]] -; OPT-NEXT: [[ALLOCA:%.*]] = freeze <128 x i8> poison -; OPT-NEXT: [[TMP0:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP1:%.*]] = insertelement <128 x i8> [[ALLOCA]], i8 [[TMP0]], i32 0 -; OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP3:%.*]] = insertelement <128 x i8> [[TMP1]], i8 [[TMP2]], i32 1 -; OPT-NEXT: [[TMP4:%.*]] = 
extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP5:%.*]] = insertelement <128 x i8> [[TMP3]], i8 [[TMP4]], i32 2 -; OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP7:%.*]] = insertelement <128 x i8> [[TMP5]], i8 [[TMP6]], i32 3 -; OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP9:%.*]] = insertelement <128 x i8> [[TMP7]], i8 [[TMP8]], i32 4 -; OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP11:%.*]] = insertelement <128 x i8> [[TMP9]], i8 [[TMP10]], i32 5 -; OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP13:%.*]] = insertelement <128 x i8> [[TMP11]], i8 [[TMP12]], i32 6 -; OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP15:%.*]] = insertelement <128 x i8> [[TMP13]], i8 [[TMP14]], i32 7 -; OPT-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP17:%.*]] = insertelement <128 x i8> [[TMP15]], i8 [[TMP16]], i32 8 -; OPT-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP19:%.*]] = insertelement <128 x i8> [[TMP17]], i8 [[TMP18]], i32 9 -; OPT-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP21:%.*]] = insertelement <128 x i8> [[TMP19]], i8 [[TMP20]], i32 10 -; OPT-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP23:%.*]] = insertelement <128 x i8> [[TMP21]], i8 [[TMP22]], i32 11 -; OPT-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP25:%.*]] = insertelement <128 x i8> [[TMP23]], i8 [[TMP24]], i32 12 -; OPT-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP27:%.*]] = insertelement <128 x i8> [[TMP25]], i8 [[TMP26]], i32 13 -; OPT-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP29:%.*]] = insertelement <128 x i8> [[TMP27]], i8 [[TMP28]], i32 14 -; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP31:%.*]] = insertelement <128 x i8> [[TMP29]], i8 [[TMP30]], i32 15 -; OPT-NEXT: [[TMP32:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP33:%.*]] = insertelement <128 x i8> [[TMP31]], i8 [[TMP32]], i32 0 -; OPT-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], i8 [[TMP34]], i32 1 -; OPT-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 2 -; OPT-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 3 -; OPT-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 4 -; OPT-NEXT: [[TMP42:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 5 -; OPT-NEXT: [[TMP44:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 6 -; OPT-NEXT: [[TMP46:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 7 -; OPT-NEXT: [[TMP48:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 8 -; OPT-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[IN]], 
i64 9 -; OPT-NEXT: [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 9 -; OPT-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 10 -; OPT-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 11 -; OPT-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 12 -; OPT-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 13 -; OPT-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 14 -; OPT-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 15 -; OPT-NEXT: [[TMP64:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP65:%.*]] = insertelement <128 x i8> [[TMP63]], i8 [[TMP64]], i32 0 -; OPT-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 1 -; OPT-NEXT: [[TMP68:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 2 -; OPT-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 3 -; OPT-NEXT: [[TMP72:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 4 -; OPT-NEXT: [[TMP74:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 5 -; OPT-NEXT: [[TMP76:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 6 -; OPT-NEXT: [[TMP78:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 7 -; OPT-NEXT: [[TMP80:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 8 -; OPT-NEXT: [[TMP82:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 9 -; OPT-NEXT: [[TMP84:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 10 -; OPT-NEXT: [[TMP86:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 11 -; OPT-NEXT: [[TMP88:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 12 -; OPT-NEXT: [[TMP90:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 13 -; OPT-NEXT: [[TMP92:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 14 -; OPT-NEXT: [[TMP94:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 15 -; OPT-NEXT: [[TMP96:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; 
OPT-NEXT: [[TMP97:%.*]] = insertelement <128 x i8> [[TMP95]], i8 [[TMP96]], i32 0 -; OPT-NEXT: [[TMP98:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 1 -; OPT-NEXT: [[TMP100:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 2 -; OPT-NEXT: [[TMP102:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 3 -; OPT-NEXT: [[TMP104:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 4 -; OPT-NEXT: [[TMP106:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 5 -; OPT-NEXT: [[TMP108:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 6 -; OPT-NEXT: [[TMP110:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 7 -; OPT-NEXT: [[TMP112:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 8 -; OPT-NEXT: [[TMP114:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 [[TMP114]], i32 9 -; OPT-NEXT: [[TMP116:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 10 -; OPT-NEXT: [[TMP118:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 11 -; OPT-NEXT: [[TMP120:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 12 -; OPT-NEXT: [[TMP122:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 13 -; OPT-NEXT: [[TMP124:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 14 -; OPT-NEXT: [[TMP126:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 15 -; OPT-NEXT: [[TMP128:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP129:%.*]] = insertelement <128 x i8> [[TMP127]], i8 [[TMP128]], i32 0 -; OPT-NEXT: [[TMP130:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 1 -; OPT-NEXT: [[TMP132:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 2 -; OPT-NEXT: [[TMP134:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 3 -; OPT-NEXT: [[TMP136:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 4 -; OPT-NEXT: [[TMP138:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 5 -; OPT-NEXT: [[TMP140:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 6 -; 
OPT-NEXT: [[TMP142:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 7 -; OPT-NEXT: [[TMP144:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 8 -; OPT-NEXT: [[TMP146:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 9 -; OPT-NEXT: [[TMP148:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 10 -; OPT-NEXT: [[TMP150:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 11 -; OPT-NEXT: [[TMP152:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 12 -; OPT-NEXT: [[TMP154:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 13 -; OPT-NEXT: [[TMP156:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP157:%.*]] = insertelement <128 x i8> [[TMP155]], i8 [[TMP156]], i32 14 -; OPT-NEXT: [[TMP158:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 15 -; OPT-NEXT: [[TMP160:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP161:%.*]] = insertelement <128 x i8> [[TMP159]], i8 [[TMP160]], i32 0 -; OPT-NEXT: [[TMP162:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 1 -; OPT-NEXT: [[TMP164:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 2 -; OPT-NEXT: [[TMP166:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 3 -; OPT-NEXT: [[TMP168:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP169:%.*]] = insertelement <128 x i8> [[TMP167]], i8 [[TMP168]], i32 4 -; OPT-NEXT: [[TMP170:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 5 -; OPT-NEXT: [[TMP172:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 6 -; OPT-NEXT: [[TMP174:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 7 -; OPT-NEXT: [[TMP176:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 8 -; OPT-NEXT: [[TMP178:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 9 -; OPT-NEXT: [[TMP180:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 10 -; OPT-NEXT: [[TMP182:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 11 -; OPT-NEXT: [[TMP184:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 12 -; OPT-NEXT: [[TMP186:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: 
[[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 13 -; OPT-NEXT: [[TMP188:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 14 -; OPT-NEXT: [[TMP190:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP191:%.*]] = insertelement <128 x i8> [[TMP189]], i8 [[TMP190]], i32 15 -; OPT-NEXT: [[TMP192:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP193:%.*]] = insertelement <128 x i8> [[TMP191]], i8 [[TMP192]], i32 0 -; OPT-NEXT: [[TMP194:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 1 -; OPT-NEXT: [[TMP196:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 2 -; OPT-NEXT: [[TMP198:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 3 -; OPT-NEXT: [[TMP200:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 4 -; OPT-NEXT: [[TMP202:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 5 -; OPT-NEXT: [[TMP204:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 6 -; OPT-NEXT: [[TMP206:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 7 -; OPT-NEXT: [[TMP208:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 8 -; OPT-NEXT: [[TMP210:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 9 -; OPT-NEXT: [[TMP212:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 10 -; OPT-NEXT: [[TMP214:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 11 -; OPT-NEXT: [[TMP216:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 12 -; OPT-NEXT: [[TMP218:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 13 -; OPT-NEXT: [[TMP220:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 14 -; OPT-NEXT: [[TMP222:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 15 -; OPT-NEXT: [[TMP224:%.*]] = extractelement <16 x i8> [[IN]], i64 0 -; OPT-NEXT: [[TMP225:%.*]] = insertelement <128 x i8> [[TMP223]], i8 [[TMP224]], i32 0 -; OPT-NEXT: [[TMP226:%.*]] = extractelement <16 x i8> [[IN]], i64 1 -; OPT-NEXT: [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 1 -; OPT-NEXT: [[TMP228:%.*]] = extractelement <16 x i8> [[IN]], i64 2 -; OPT-NEXT: [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 2 -; OPT-NEXT: [[TMP230:%.*]] = extractelement <16 x i8> [[IN]], i64 3 -; OPT-NEXT: [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 3 -; 
OPT-NEXT: [[TMP232:%.*]] = extractelement <16 x i8> [[IN]], i64 4 -; OPT-NEXT: [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 4 -; OPT-NEXT: [[TMP234:%.*]] = extractelement <16 x i8> [[IN]], i64 5 -; OPT-NEXT: [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 5 -; OPT-NEXT: [[TMP236:%.*]] = extractelement <16 x i8> [[IN]], i64 6 -; OPT-NEXT: [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 6 -; OPT-NEXT: [[TMP238:%.*]] = extractelement <16 x i8> [[IN]], i64 7 -; OPT-NEXT: [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 7 -; OPT-NEXT: [[TMP240:%.*]] = extractelement <16 x i8> [[IN]], i64 8 -; OPT-NEXT: [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 8 -; OPT-NEXT: [[TMP242:%.*]] = extractelement <16 x i8> [[IN]], i64 9 -; OPT-NEXT: [[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 9 -; OPT-NEXT: [[TMP244:%.*]] = extractelement <16 x i8> [[IN]], i64 10 -; OPT-NEXT: [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 10 -; OPT-NEXT: [[TMP246:%.*]] = extractelement <16 x i8> [[IN]], i64 11 -; OPT-NEXT: [[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 11 -; OPT-NEXT: [[TMP248:%.*]] = extractelement <16 x i8> [[IN]], i64 12 -; OPT-NEXT: [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 12 -; OPT-NEXT: [[TMP250:%.*]] = extractelement <16 x i8> [[IN]], i64 13 -; OPT-NEXT: [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 13 -; OPT-NEXT: [[TMP252:%.*]] = extractelement <16 x i8> [[IN]], i64 14 -; OPT-NEXT: [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 14 -; OPT-NEXT: [[TMP254:%.*]] = extractelement <16 x i8> [[IN]], i64 15 -; OPT-NEXT: [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 15 -; OPT-NEXT: [[TMP256:%.*]] = extractelement <128 x i8> [[TMP255]], i32 80 -; OPT-NEXT: [[TMP257:%.*]] = insertelement <16 x i8> poison, i8 [[TMP256]], i64 0 -; OPT-NEXT: [[TMP258:%.*]] = extractelement <128 x i8> [[TMP255]], i32 81 -; OPT-NEXT: [[TMP259:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP258]], i64 1 -; OPT-NEXT: [[TMP260:%.*]] = extractelement <128 x i8> [[TMP255]], i32 82 -; OPT-NEXT: [[TMP261:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP260]], i64 2 -; OPT-NEXT: [[TMP262:%.*]] = extractelement <128 x i8> [[TMP255]], i32 83 -; OPT-NEXT: [[TMP263:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP262]], i64 3 -; OPT-NEXT: [[TMP264:%.*]] = extractelement <128 x i8> [[TMP255]], i32 84 -; OPT-NEXT: [[TMP265:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP264]], i64 4 -; OPT-NEXT: [[TMP266:%.*]] = extractelement <128 x i8> [[TMP255]], i32 85 -; OPT-NEXT: [[TMP267:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP266]], i64 5 -; OPT-NEXT: [[TMP268:%.*]] = extractelement <128 x i8> [[TMP255]], i32 86 -; OPT-NEXT: [[TMP269:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP268]], i64 6 -; OPT-NEXT: [[TMP270:%.*]] = extractelement <128 x i8> [[TMP255]], i32 87 -; OPT-NEXT: [[TMP271:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP270]], i64 7 -; OPT-NEXT: [[TMP272:%.*]] = extractelement <128 x i8> [[TMP255]], i32 88 -; OPT-NEXT: [[TMP273:%.*]] = insertelement <16 x i8> [[TMP271]], i8 [[TMP272]], i64 8 -; OPT-NEXT: [[TMP274:%.*]] = extractelement <128 x i8> [[TMP255]], i32 89 -; OPT-NEXT: [[TMP275:%.*]] = insertelement <16 x i8> [[TMP273]], i8 [[TMP274]], i64 9 -; OPT-NEXT: [[TMP276:%.*]] = extractelement 
<128 x i8> [[TMP255]], i32 90 -; OPT-NEXT: [[TMP277:%.*]] = insertelement <16 x i8> [[TMP275]], i8 [[TMP276]], i64 10 -; OPT-NEXT: [[TMP278:%.*]] = extractelement <128 x i8> [[TMP255]], i32 91 -; OPT-NEXT: [[TMP279:%.*]] = insertelement <16 x i8> [[TMP277]], i8 [[TMP278]], i64 11 -; OPT-NEXT: [[TMP280:%.*]] = extractelement <128 x i8> [[TMP255]], i32 92 -; OPT-NEXT: [[TMP281:%.*]] = insertelement <16 x i8> [[TMP279]], i8 [[TMP280]], i64 12 -; OPT-NEXT: [[TMP282:%.*]] = extractelement <128 x i8> [[TMP255]], i32 93 -; OPT-NEXT: [[TMP283:%.*]] = insertelement <16 x i8> [[TMP281]], i8 [[TMP282]], i64 13 -; OPT-NEXT: [[TMP284:%.*]] = extractelement <128 x i8> [[TMP255]], i32 94 -; OPT-NEXT: [[TMP285:%.*]] = insertelement <16 x i8> [[TMP283]], i8 [[TMP284]], i64 14 -; OPT-NEXT: [[TMP286:%.*]] = extractelement <128 x i8> [[TMP255]], i32 95 -; OPT-NEXT: [[TMP287:%.*]] = insertelement <16 x i8> [[TMP285]], i8 [[TMP286]], i64 15 -; OPT-NEXT: [[SUM:%.*]] = add <16 x i8> [[TMP287]], [[ADD]] -; OPT-NEXT: store <16 x i8> [[SUM]], ptr addrspace(3) [[OUT]], align 16 -; OPT-NEXT: ret void -; -entry: - %alloca = alloca [8 x <16 x i8>], align 16, addrspace(5) - %gep0 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 0 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep1 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 1 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep2 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 2 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep3 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 3 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep4 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 4 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep5 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 5 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep6 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 6 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %gep7 = getelementptr [8 x <16 x i8>], ptr addrspace(5) %alloca, i64 0, i64 7 - store <16 x i8> %in, ptr addrspace(5) %gep0, align 16 - %load = load <16 x i8>, ptr addrspace(5) %gep5, align 16 - %sum = add <16 x i8> %load, %add - store <16 x i8> %sum, ptr addrspace(3) %out, align 16 - ret void -} - -attributes #0 = {"amdgpu-waves-per-eu"="2,2"} diff --git a/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir new file mode 100644 index 0000000000000..381cb8c9d1047 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/reg-coalescer-subreg-liveness.mir @@ -0,0 +1,131 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass=register-coalescer -verify-coalescing -o - %s | FileCheck %s + +# This test is to check fix for failure with "Bad machine code: Defining instruction does not modify register" due to corrupt lane mask. 
+ +--- +name: reg_coalescer_subreg_liveness +tracksRegLiveness: true +liveins: +body: | + ; CHECK-LABEL: name: reg_coalescer_subreg_liveness + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1 + ; CHECK-NEXT: undef [[S_MOV_B32_1:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0 + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub2:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub3:sgpr_128 = COPY [[S_MOV_B32_]].sub0 + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_MOV_B32_]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: TENSOR_LOAD_TO_LDS_D2 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_1]], 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + ; CHECK-NEXT: $vcc_lo = COPY $exec_lo + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0 + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 1 + ; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc_lo + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x80000000) + liveins: $sgpr4_sgpr5 + + %0:sgpr_64 = COPY killed $sgpr4_sgpr5 + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + %2:sreg_32 = S_MOV_B32 1 + undef %3.sub0:sgpr_128 = COPY %2 + %4:sreg_32 = S_MOV_B32 0 + undef %5.sub0:sgpr_256 = COPY %4 + TENSOR_LOAD_TO_LDS_D2 %3, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + %6:sgpr_128 = COPY killed %3 + %6.sub1:sgpr_128 = COPY killed %1 + %7:sreg_32 = COPY $exec_lo + %8:sreg_32 = COPY %2 + %9:sreg_32 = COPY %4 + + bb.1: + successors: %bb.1(0x40000000), %bb.2(0x40000000) + + %10:sreg_32 = COPY killed %8 + undef %11.sub0:sgpr_128 = COPY %2 + %11.sub1:sgpr_128 = COPY killed %10 + %11.sub2:sgpr_128 = COPY %2 + %11.sub3:sgpr_128 = COPY %2 + TENSOR_LOAD_TO_LDS_D2 killed %11, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + %12:sreg_32 = COPY killed %9 + %13:sgpr_128 = COPY %6 + %13.sub2:sgpr_128 = COPY killed %12 + TENSOR_LOAD_TO_LDS_D2 killed %13, %5, 0, 0, implicit-def dead $tensorcnt, implicit $exec, implicit $tensorcnt + $vcc_lo = COPY %7 + %8:sreg_32 = COPY %4 + %9:sreg_32 = COPY %2 + S_CBRANCH_VCCNZ %bb.1, implicit killed $vcc_lo, implicit $vcc_lo, implicit $vcc + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... 
+--- +name: reg_coalescer_subreg_liveness_2 +tracksRegLiveness: true +liveins: +body: | + ; CHECK-LABEL: name: reg_coalescer_subreg_liveness_2 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr4_sgpr5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK-NEXT: undef [[S_LOAD_DWORD_IMM:%[0-9]+]].sub2:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub1:sgpr_128 = S_LOAD_DWORD_IMM [[COPY]], 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]].sub0:sgpr_128 = S_MOV_B32 1 + ; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub0:sgpr_256 = S_MOV_B32 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit [[S_LOAD_DWORD_IMM]], implicit [[S_MOV_B32_]] + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x80000000) + liveins: $sgpr4_sgpr5 + + %0:sgpr_64 = COPY killed $sgpr4_sgpr5 + %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 0, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) + %2:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM killed %0, 24, 0 :: (dereferenceable invariant load (s32), align 8, addrspace 4) + %3:sreg_32 = S_MOV_B32 1 + undef %4.sub0:sgpr_128 = COPY %3 + %5:sgpr_128 = COPY %4 + %5.sub1:sgpr_128 = COPY killed %2 + %6:sgpr_128 = COPY %5 + %6.sub2:sgpr_128 = COPY killed %1 + %7:sreg_32 = S_MOV_B32 0 + undef %8.sub0:sgpr_256 = COPY %7 + %9:sreg_32 = COPY %3 + + bb.1: + successors: %bb.2(0x80000000) + + %10:sreg_32 = COPY killed %9 + undef %11.sub0:sgpr_128 = COPY %3 + %11.sub1:sgpr_128 = COPY killed %10 + S_NOP 0, implicit %5, implicit %8 + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir index 002d43f937837..131656975ec40 100644 --- a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir +++ b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s -debugify-and-strip-all-safe | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX --- @@ -40,6 +41,27 @@ body: | S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode ... 
+--- +name: meta_in_between +body: | + bb.0: + ; GCN-LABEL: name: meta_in_between + ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: KILL $sgpr0 + ; GCN-NEXT: $sgpr0 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + KILL $sgpr0 + $sgpr0 = IMPLICIT_DEF + S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode + $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode +... + --- name: valu_write_in_between body: | diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll index 34de1e48bfb59..01bcdad3fc220 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll @@ -3,15 +3,16 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA define void @nested_inf_loop(i1 %0, i1 %1) { -; OPT-LABEL: @nested_inf_loop( -; OPT-NEXT: BB: -; OPT-NEXT: br label [[BB1:%.*]] -; OPT: BB1: -; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]] -; OPT-NEXT: br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]] -; OPT: infloop: -; OPT-NEXT: br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]] -; OPT: DummyReturnBlock: +; OPT-LABEL: define void @nested_inf_loop( +; OPT-SAME: i1 [[TMP0:%.*]], i1 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: br label %[[BB1:.*]] +; OPT: [[BB1]]: +; OPT-NEXT: [[BRMERGE:%.*]] = select i1 [[TMP0]], i1 true, i1 [[TMP1]] +; OPT-NEXT: br i1 [[BRMERGE]], label %[[BB1]], label %[[INFLOOP:.*]] +; OPT: [[INFLOOP]]: +; OPT-NEXT: br i1 true, label %[[INFLOOP]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[DUMMYRETURNBLOCK]]: ; OPT-NEXT: ret void ; ; ISA-LABEL: nested_inf_loop: @@ -63,3 +64,84 @@ BB4: BB3: br label %BB1 } + +define void @nested_inf_loop_callbr(i32 %0, i32 %1) { +; OPT-LABEL: define void @nested_inf_loop_callbr( +; OPT-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) { +; OPT-NEXT: [[BB:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1:.*]] [] +; OPT: [[BB1]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP0]]) +; OPT-NEXT: to label %[[BB3:.*]] [label %BB2] +; OPT: [[BB2:.*:]] +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB4:.*]] [] +; OPT: [[BB4]]: +; OPT-NEXT: br i1 true, label %[[TRANSITIONBLOCK:.*]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT: [[TRANSITIONBLOCK]]: +; OPT-NEXT: callbr void asm "", "r,!i"(i32 [[TMP1]]) +; 
OPT-NEXT: to label %[[BB3]] [label %BB4] +; OPT: [[BB3]]: +; OPT-NEXT: callbr void asm "", ""() +; OPT-NEXT: to label %[[BB1]] [] +; OPT: [[DUMMYRETURNBLOCK]]: +; OPT-NEXT: ret void +; +; ISA-LABEL: nested_inf_loop_callbr: +; ISA: ; %bb.0: ; %BB +; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: ; implicit-def: $sgpr6_sgpr7 +; ISA-NEXT: ; implicit-def: $sgpr4_sgpr5 +; ISA-NEXT: .LBB1_1: ; %BB1 +; ISA-NEXT: ; =>This Inner Loop Header: Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; ISA-NEXT: s_and_b64 s[8:9], s[4:5], exec +; ISA-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] +; ISA-NEXT: .LBB1_2: ; %BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_andn2_b64 s[4:5], s[4:5], exec +; ISA-NEXT: s_and_b64 s[8:9], s[6:7], exec +; ISA-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] +; ISA-NEXT: s_branch .LBB1_1 +; ISA-NEXT: .LBB1_3: ; Inline asm indirect target +; ISA-NEXT: ; %BB2 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: ; Label of block must be emitted +; ISA-NEXT: ;;#ASMSTART +; ISA-NEXT: ;;#ASMEND +; ISA-NEXT: s_mov_b64 s[6:7], -1 +; ISA-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; ISA-NEXT: s_cbranch_execz .LBB1_5 +; ISA-NEXT: ; %bb.4: ; %TransitionBlock.target.BB3 +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_xor_b64 s[6:7], exec, -1 +; ISA-NEXT: .LBB1_5: ; %loop.exit.guard +; ISA-NEXT: ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT: s_or_b64 exec, exec, s[8:9] +; ISA-NEXT: s_and_b64 vcc, exec, s[6:7] +; ISA-NEXT: s_mov_b64 s[6:7], 0 +; ISA-NEXT: s_cbranch_vccz .LBB1_2 +; ISA-NEXT: ; %bb.6: ; %DummyReturnBlock +; ISA-NEXT: s_setpc_b64 s[30:31] +BB: + callbr void asm "", ""() to label %BB1 [] + +BB1: + callbr void asm "", "r,!i"(i32 %0) to label %BB3 [label %BB2] + +BB2: + callbr void asm "", ""() to label %BB4 [] + +BB4: + callbr void asm "", "r,!i"(i32 %1) to label %BB3 [label %BB4] + +BB3: + callbr void asm "", ""() to label %BB1 [] +} diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index 4cbe682cf9f9f..004c27971131d 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY ; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck %s declare void @llvm.trap() @@ -70,8 +70,33 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { ; CHECK-NEXT: s_mov_b64 s[2:3], -1 ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: s_branch .LBB0_4 - - +; UNIFY-LABEL: @kernel( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: br i1 [[CMP1]], label [[IF_END6_SINK_SPLIT:%.*]], label [[COND_FALSE:%.*]] +; UNIFY: cond.false: 
+; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: br i1 [[CMP2]], label [[IF_THEN3:%.*]], label [[IF_END6:%.*]] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: br i1 [[CMP1_I7]], label [[IF_END6_SINK_SPLIT]], label [[COND_FALSE_I8:%.*]] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 +; UNIFY-NEXT: br label [[IF_END6]] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %cmp = icmp eq i32 %n, 256 @@ -105,5 +130,129 @@ if.end6.sink.split: if.end6: ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; UNIFY: {{.*}} + +define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { +; CHECK-LABEL: kernel_callbr: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s1, s[8:9], 0x10 +; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmpk_eq_i32 s1, 0x100 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.1: ; %if.then +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_2: ; %if.end6.sink.split +; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8 +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, s0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_dword v0, v1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: .LBB1_3: ; Inline asm indirect target +; CHECK-NEXT: ; %UnifiedReturnBlock +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: .LBB1_4: ; Inline asm indirect target +; CHECK-NEXT: ; %if.else +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 10, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: ; %bb.5: ; %if.then3 +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: s_branch .LBB1_2 +; CHECK-NEXT: .LBB1_6: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false.i8 +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: .LBB1_7: ; Inline asm indirect target +; CHECK-NEXT: ; %cond.false +; CHECK-NEXT: ; Label of block must be emitted +; CHECK-NEXT: s_trap 2 +; CHECK-NEXT: ; divergent unreachable +; CHECK-NEXT: s_branch .LBB1_3 +; UNIFY-LABEL: @kernel_callbr( +; UNIFY-NEXT: entry: +; UNIFY-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT: [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT: [[CMP32:%.*]] = zext i1 [[CMP]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP32]]) +; UNIFY-NEXT: to label [[IF_THEN:%.*]] [label %if.else] +; UNIFY: if.then: +; UNIFY-NEXT: [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT: [[CMP1_32:%.*]] = zext i1 [[CMP1]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 
[[CMP1_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT:%.*]] [label %cond.false] +; UNIFY: cond.false: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.else: +; UNIFY-NEXT: [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT: [[CMP2_32:%.*]] = zext i1 [[CMP2]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP2_32]]) +; UNIFY-NEXT: to label [[IF_THEN3:%.*]] [label %if.end6] +; UNIFY: if.then3: +; UNIFY-NEXT: [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT: [[CMP1_I7_32:%.*]] = zext i1 [[CMP1_I7]] to i32 +; UNIFY-NEXT: callbr void asm "", "r,!i"(i32 [[CMP1_I7_32]]) +; UNIFY-NEXT: to label [[IF_END6_SINK_SPLIT]] [label %cond.false.i8] +; UNIFY: cond.false.i8: +; UNIFY-NEXT: call void @llvm.trap() +; UNIFY-NEXT: unreachable +; UNIFY: if.end6.sink.split: +; UNIFY-NEXT: [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT: store i32 [[A]], ptr addrspace(1) [[X1]], align 4 +; UNIFY-NEXT: callbr void asm "", ""() +; UNIFY-NEXT: to label [[IF_END6:%.*]] [] +; UNIFY: if.end6: +; UNIFY-NEXT: ret void +; +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %cmp = icmp eq i32 %n, 256 + %cmp32 = zext i1 %cmp to i32 + callbr void asm "", "r,!i"(i32 %cmp32) to label %if.then [label %if.else] + +if.then: + %cmp1 = icmp eq i32 %a, 0 + %cmp1_32 = zext i1 %cmp1 to i32 + callbr void asm "", "r,!i"(i32 %cmp1_32) to label %if.end6.sink.split [label %cond.false] + +cond.false: + call void @llvm.trap() + unreachable + +if.else: + %cmp2 = icmp ult i32 %tid, 10 + %cmp2_32 = zext i1 %cmp2 to i32 + callbr void asm "", "r,!i"(i32 %cmp2_32) to label %if.then3 [label %if.end6] + +if.then3: + %cmp1.i7 = icmp eq i32 %a, 0 + %cmp1.i7_32 = zext i1 %cmp1.i7 to i32 + callbr void asm "", "r,!i"(i32 %cmp1.i7_32) to label %if.end6.sink.split [label %cond.false.i8] + +cond.false.i8: + call void @llvm.trap() + unreachable + +if.end6.sink.split: + %x1 = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %tid + store i32 %a, ptr addrspace(1) %x1, align 4 + callbr void asm "", ""() to label %if.end6 [] + +if.end6: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll index 50666bee325e8..684dc1a1f0092 100644 --- a/llvm/test/CodeGen/AMDGPU/update-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll @@ -37,3 +37,42 @@ n28: ; preds = %.loopexit, %n28 n31: ; preds = ret void } + +define amdgpu_ps void @_amdgpu_ps_main_callbr() local_unnamed_addr #3 { +; IR-LABEL: @_amdgpu_ps_main_callbr( +; IR-NEXT: .entry: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[DOTLOOPEXIT:%.*]] [] +; IR: .loopexit: +; IR-NEXT: callbr void asm "", ""() +; IR-NEXT: to label [[N28:%.*]] [] +; IR: n28: +; IR-NEXT: [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ] +; IR-NEXT: [[N29]] = fadd float [[DOT01]], 1.000000e+00 +; IR-NEXT: [[N30:%.*]] = fcmp ogt float [[N29]], 4.000000e+00 +; IR-NEXT: [[N30_32:%.*]] = zext i1 [[N30]] to i32 +; IR-NEXT: br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]] +; IR: TransitionBlock: +; IR-NEXT: callbr void asm "", "r,!i"(i32 [[N30_32]]) +; IR-NEXT: to label [[DOTLOOPEXIT]] [label %n28] +; IR: n31: +; IR-NEXT: ret void +; IR: DummyReturnBlock: +; IR-NEXT: ret void +; +.entry: + callbr void asm "", ""() to label %.loopexit [] + +.loopexit: ; preds = %n28, %.entry + callbr void asm "", ""() to label %n28 [] + +n28: ; preds = %.loopexit, %n28 + %.01 = phi float [ 0.000000e+00, %.loopexit ], [ %n29, %n28 ] + 
%n29 = fadd float %.01, 1.0 + %n30 = fcmp ogt float %n29, 4.000000e+00 + %n30.32 = zext i1 %n30 to i32 + callbr void asm "", "r,!i"(i32 %n30.32) to label %.loopexit [label %n28] + +n31: ; preds = + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll index 4d5ade4abcef7..1b4ed67eb6eea 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll @@ -2481,10 +2481,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2502,10 +2503,11 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v1 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v3, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v1 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2524,8 +2526,8 @@ define i64 @test_vector_reduce_mul_v2i64(<2 x i64> %v) { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v4, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v2, v[6:7] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v5, v2, v[6:7] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v8 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v2i64: @@ -2626,9 +2628,9 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v8 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2654,9 +2656,9 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v8 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v3, v[1:2] ; GFX8-GISEL-NEXT: 
v_mad_u64_u32 v[0:1], s[4:5], v7, v4, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v2, v[8:9] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v5, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v4, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v2, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v7, v5, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v4, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2677,12 +2679,12 @@ define i64 @test_vector_reduce_mul_v3i64(<3 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v2, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v1, v2, v[8:9] +; GFX9-GISEL-NEXT: v_add_u32_e32 v8, v7, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v6, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v4, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v4, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v3i64: @@ -2816,10 +2818,10 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2853,10 +2855,10 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v12 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v2, v7, v[0:1] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v6, v[15:16] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v4, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v11, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v3, v6, v[15:16] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v4, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v17, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v11, v[3:4] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -2881,16 +2883,16 @@ define i64 @test_vector_reduce_mul_v4i64(<4 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v0, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v4, v[10:11] -; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v0 +; 
GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v1, v4, v[10:11] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v7, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v2, v6, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v3, v6, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v5, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v2, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v3, v6, v[0:1] +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v9, v12 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v8, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v8, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v9, v4, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v9, v4, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v4i64: @@ -3068,31 +3070,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v13, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v5, v21 ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0 +; GFX7-GISEL-NEXT: 
v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v8 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3139,31 +3139,29 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v4, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v0, v8, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v13, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v5, v12, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v19, v16, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v13 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v13, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[13:14] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v19, v4, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v8, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v4, v13, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v5, v12, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v20, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v20, v22, v[5:6] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, v21 ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v6, v14, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v8, v[17:18] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v0, v9, v[5:6] ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v20 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v15, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, v18 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v2, v11, v[6:7] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v14, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v19, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v3, v10, v[20:21] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v7 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v17, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v6, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v2, v19, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v16, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v2, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v6, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v1, v8, v[17:18] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v6, v15, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v2, v10, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v11, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v14, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v19, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v3, v10, v[5:6] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v8 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v0, v1, v[2:3] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v7, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v14, v19, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v21, v16, v[12:13] +; GFX8-GISEL-NEXT: 
v_mad_u64_u32 v[8:9], s[4:5], v4, v2, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v7, v[8:9] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3204,32 +3202,32 @@ define i64 @test_vector_reduce_mul_v8i64(<8 x i64> %v) { ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v0, v9, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v8, v[18:19] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[0:1], v1, v8, v[18:19] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v11, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v10, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v3, v10, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v13, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v12, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v4 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v5, v12, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v15, 0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v18 +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v3, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v14, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v14, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v7, v14, v[8:9] +; GFX9-GISEL-NEXT: v_add_u32_e32 v17, v17, v20 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v16, v1, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v6 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v17, v2, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v0, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v16, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v17, v2, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v2 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v0, v4, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v5, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v10, v4, v[0:1] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v4, v[8:9] ; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v7, v10 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v7, v2, v[4:5] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v10, v2, v[4:5] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v6 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v8i64: @@ -3550,63 +3548,63 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX7-GISEL: ; %bb.0: ; %entry ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX7-GISEL-NEXT: 
v_mov_b32_e32 v24, v25 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v1, v16, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v8, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] -; GFX7-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] +; GFX7-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v32 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26] ; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v10 ; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, v13 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, v18 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], 
s[4:5], v14, v30, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v20, v[12:13] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0 ; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0 +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1] +; GFX7-GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0 +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2] +; GFX7-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5] ; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3695,63 +3693,63 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX8-GISEL: ; %bb.0: ; %entry ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v8, v24, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v25, v[32:33] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v24, v[32:33] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v16, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v24, v31, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v9, v33 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v24, v8, v[9:10] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v24, v25 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v0, v17, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 
v[0:1], s[4:5], v1, v16, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v18, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v31, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v8, v25, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[35:36], s[4:5], v9, v24, v[33:34] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v16, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v8, v31, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[4:5], v8, v35, v[25:26] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v8, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[4:5], v0, v17, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v16, v[34:35] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v10, v26, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v27, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11, v26, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v16, v0, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v11 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v16, v9, v[1:2] -; GFX8-GISEL-NEXT: buffer_load_dword v9, off, s[0:3], s32 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v17 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v2, v19, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v31, v[32:33] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v10, v27, v[1:2] +; GFX8-GISEL-NEXT: buffer_load_dword v27, off, s[0:3], s32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[31:32], s[4:5], v2, v18, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[33:34], s[4:5], v11, v26, v[8:9] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v0, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v9 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v31, v33, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v32 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[25:26], s[4:5], v2, v19, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v18, v[25:26] ; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v12, v28, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v1, v0, v[24:25] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v4, v20, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v1, v0, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v4, v20, 0 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v29, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v17, v2, 0 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v21, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v12, v29, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v9, v2, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v28, v[18:19] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v10 ; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v12 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v30, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v20, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v0, v[1:2] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, v13 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[17:18], s[4:5], v6, v22, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v2, v[0:1] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v3, v18 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v21, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v14, v30, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v0, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], 
s[4:5], v5, v20, v[12:13] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v4 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v22, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v0, v2, v[18:19] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v4, v3, 0 ; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v9, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v17, v12, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v15, v30, v[4:5] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v23, v[3:4] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v17, v1, v[2:3] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v22, v[3:4] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v32, v11, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v3, v12, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v13, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v32, v0, v[5:6] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v10, v1, v[0:1] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v2, 0 -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v16, v13, v[9:10] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v8, v11, v[5:6] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v4, v9, v[1:2] -; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[3:4] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v14, v27, v[1:2] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, v5 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v30, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v6, v23, v[2:3] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, v13 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v0, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v7, v22, v[9:10] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v24, v11, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v0, v3, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v8, v12, 0 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v10 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v24, v20, v[0:1] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v8, v1, v[0:1] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v9, v3, 0 +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v17, v12, v[13:14] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v16, v11, v[5:6] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v7, v[1:2] +; GFX8-GISEL-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v3, v[4:5] ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v16i64: @@ -3827,65 +3825,65 @@ define i64 @test_vector_reduce_mul_v16i64(<16 x i64> %v) { ; GFX9-GISEL-NEXT: scratch_load_dword v31, off, s32 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v0, v17, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[32:33], s[0:1], v0, v16, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v1, v16, v[34:35] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[36:37], s[0:1], v1, v16, v[34:35] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v2, v19, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v3, v18, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[34:35], s[0:1], v3, v18, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v4, v21, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v18, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v18, 0 ; 
GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v20, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v5, v20, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v5, v20, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v6, v23, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v19, v3, v4 +; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v3, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v6, v22, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v7, v22, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v7, v22, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v8, v25, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v20, v5, v6 +; GFX9-GISEL-NEXT: v_add_u32_e32 v21, v5, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v8, v24, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v9, v24, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v9, v24, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v10, v27, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v8 +; GFX9-GISEL-NEXT: v_add_u32_e32 v34, v1, v34 +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v7, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v10, v26, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v11, v26, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v11, v26, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v12, v29, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v9, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v12, v28, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v13, v28, v[16:17] -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v12 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v13, v28, v[16:17] +; GFX9-GISEL-NEXT: v_add_u32_e32 v33, v33, v36 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v11, v18 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[12:13], s[0:1], v14, v30, 0 ; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v14, v31, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v15, v30, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v15, v30, v[16:17] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v32, v1, 0 -; GFX9-GISEL-NEXT: v_add_u32_e32 v9, v13, v14 +; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v13, v18 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[18:19], s[0:1], v33, v6, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[16:17], s[0:1], v0, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[14:15], s[0:1], v32, v6, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v33, v6, v[16:17] -; GFX9-GISEL-NEXT: v_add_u32_e32 v11, v15, v6 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v0, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, v3, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v18, v8, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v7, v7, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v34, v8, v[16:17] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v2, v5, 0 +; GFX9-GISEL-NEXT: v_add_u32_e32 v13, v15, v18 +; GFX9-GISEL-NEXT: v_add_u32_e32 v15, v7, v0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, v10, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v2, v5, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v19, v10, v[2:3] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v20, v10, v[8:9] +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v4, v11, 0 ; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v12, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v4, v9, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], 
s[0:1], v20, v12, v[4:5] -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v21, v12, v[8:9] ; GFX9-GISEL-NEXT: v_add_u32_e32 v3, v3, v4 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v14, v0, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v11, v0, v[8:9] -; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v5, v0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[10:11], s[0:1], v13, v0, v[8:9] ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v6, v3, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[8:9], s[0:1], v6, v2, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v7, v2, v[0:1] -; GFX9-GISEL-NEXT: v_add_u32_e32 v2, v9, v0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v2, 0 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[6:7], s[0:1], v15, v2, v[0:1] +; GFX9-GISEL-NEXT: v_add_u32_e32 v10, v5, v10 +; GFX9-GISEL-NEXT: v_add_u32_e32 v5, v9, v6 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0 ; GFX9-GISEL-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v4, v8, 0 -; GFX9-GISEL-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v5, v8, v[2:3] -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v2 +; GFX9-GISEL-NEXT: v_mad_u64_u32 v[4:5], s[0:1], v10, v8, v[2:3] +; GFX9-GISEL-NEXT: v_add_u32_e32 v1, v1, v4 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v16i64: diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir index a1381ecad81e2..f964480dcc633 100644 --- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir @@ -1069,6 +1069,51 @@ body: | $sgpr0 = S_MOV_B32 $sgpr0 ... +# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0. +--- +name: mixed_pending_events +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true +body: | + ; GCN-LABEL: name: mixed_pending_events + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $sgpr2, $vgpr2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAIT_LOADCNT 1 + ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GCN-NEXT: S_WAIT_KMCNT 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2 + ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + bb.0: + liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc + $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0 + S_CBRANCH_SCC1 %bb.2, implicit $scc + bb.1: + liveins: $vgpr0_vgpr1, $sgpr2 + $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + $vgpr3 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec + bb.2: + liveins: $sgpr2, $vgpr2 + $vgpr2 = V_MOV_B32_e32 $vgpr2, implicit $exec + $sgpr2 = S_MOV_B32 $sgpr2 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec +... 
+ --- name: pending_vmem_event_between_block tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir new file mode 100644 index 0000000000000..df3e780c61f46 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir @@ -0,0 +1,42 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -run-pass=machine-sink %s -o - | FileCheck %s + +--- +name: wmma_test +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: wmma_test + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF + ; CHECK-NEXT: early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, [[DEF]], 8, [[DEF1]], 8, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[DEF2]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_256 = COPY %3.sub1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: SI_END_CF [[SI_IF]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + %0:vreg_128 = IMPLICIT_DEF + %1:vreg_128 = IMPLICIT_DEF + %2:sreg_32 = IMPLICIT_DEF + early-clobber %3:vreg_256 = V_WMMA_F32_16X16X16_F16_w32_threeaddr 8, %0:vreg_128, 8, %1:vreg_128, 8, 0, 0, 0, implicit $exec + %4:sreg_32 = SI_IF %2:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_BRANCH %bb.1 + + bb.1: + %5:vreg_256 = COPY %3.sub1:vreg_256 + + bb.2: + SI_END_CF %4:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec + S_ENDPGM 0 + +... 
diff --git a/llvm/test/CodeGen/ARM/llvm.sincos.ll b/llvm/test/CodeGen/ARM/llvm.sincos.ll index 9628405df6bcb..1448fac8d864f 100644 --- a/llvm/test/CodeGen/ARM/llvm.sincos.ll +++ b/llvm/test/CodeGen/ARM/llvm.sincos.ll @@ -1,223 +1,1004 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=thumbv7-gnu-linux < %s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -mtriple=thumbv7-gnu-linux < %s | FileCheck -check-prefix=GNU %s +; RUN: llc -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 < %s | FileCheck -check-prefix=GNUEABI %s +; RUN: llc -mtriple=armv7-apple-ios6 -mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-NO-STRET %s +; RUN: llc -mtriple=armv7-apple-ios7 -mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-WITH-STRET %s +; RUN: llc -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefix=WATCHABI %s define { half, half } @test_sincos_f16(half %a) { -; CHECK-LABEL: test_sincos_f16: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #4] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: ldr r0, [sp] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: mov r1, r0 -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_f16: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #4] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: ldr r0, [sp] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: mov r1, r0 +; GNU-NEXT: mov r0, r4 +; GNU-NEXT: add sp, #8 +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_f16: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: ldr r0, [sp] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: mov r1, r0 +; GNUEABI-NEXT: mov r0, r4 +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: mov r1, r0 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f16: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldm sp, {r0, r4} +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: mov r5, r0 +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, r5 +; IOS-WITH-STRET-NEXT: add 
sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {r4, r5, pc} +; +; WATCHABI-LABEL: test_sincos_f16: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: vcvtb.f16.f32 s1, s1 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) ret { half, half } %result } define half @test_sincos_f16_only_use_sin(half %a) { -; CHECK-LABEL: test_sincos_f16_only_use_sin: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #4] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f16_only_use_sin: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #4] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: add sp, #8 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f16_only_use_sin: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16_only_use_sin: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: pop {lr} +; IOS-NO-STRET-NEXT: bx lr +; +; IOS-WITH-STRET-LABEL: test_sincos_f16_only_use_sin: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: add sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f16_only_use_sin: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) %result.0 = extractvalue { half, half } %result, 0 ret half %result.0 } define half @test_sincos_f16_only_use_cos(half %a) { -; CHECK-LABEL: test_sincos_f16_only_use_cos: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; 
CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f16_only_use_cos: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: add sp, #8 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f16_only_use_cos: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: add sp, sp, #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16_only_use_cos: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {lr} +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: pop {lr} +; IOS-NO-STRET-NEXT: bx lr +; +; IOS-WITH-STRET-LABEL: test_sincos_f16_only_use_cos: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp, #4] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: add sp, sp, #8 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f16_only_use_cos: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s1 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { half, half } @llvm.sincos.f16(half %a) %result.1 = extractvalue { half, half } %result, 1 ret half %result.1 } define { <2 x half>, <2 x half> } @test_sincos_v2f16(<2 x half> %a) { -; CHECK-LABEL: test_sincos_v2f16: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #12 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: mov r0, r4 -; CHECK-NEXT: bl __gnu_h2f_ieee -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldr r0, [sp, #12] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: ldr r1, [sp, #4] -; CHECK-NEXT: strh.w r0, [sp, #22] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: strh.w r0, [sp, #20] -; CHECK-NEXT: add r0, sp, #20 -; CHECK-NEXT: vld1.32 {d8[0]}, [r0:32] -; CHECK-NEXT: ldr r0, [sp, #8] -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: ldr r1, [sp] -; CHECK-NEXT: strh.w r0, [sp, #18] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: bl __gnu_f2h_ieee -; CHECK-NEXT: strh.w r0, [sp, #16] -; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: vmovl.u16 q9, d8 -; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32] -; CHECK-NEXT: vmovl.u16 q8, d16 
-; CHECK-NEXT: vmov.32 r0, d18[0] -; CHECK-NEXT: vmov.32 r1, d18[1] -; CHECK-NEXT: vmov.32 r2, d16[0] -; CHECK-NEXT: vmov.32 r3, d16[1] -; CHECK-NEXT: add sp, #24 -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_v2f16: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: vpush {d8} +; GNU-NEXT: sub sp, #24 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #12 +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: bl sincosf +; GNU-NEXT: mov r0, r4 +; GNU-NEXT: bl __gnu_h2f_ieee +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldr r0, [sp, #12] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: ldr r1, [sp, #4] +; GNU-NEXT: strh.w r0, [sp, #22] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: strh.w r0, [sp, #20] +; GNU-NEXT: add r0, sp, #20 +; GNU-NEXT: vld1.32 {d8[0]}, [r0:32] +; GNU-NEXT: ldr r0, [sp, #8] +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: ldr r1, [sp] +; GNU-NEXT: strh.w r0, [sp, #18] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: bl __gnu_f2h_ieee +; GNU-NEXT: strh.w r0, [sp, #16] +; GNU-NEXT: add r0, sp, #16 +; GNU-NEXT: vmovl.u16 q9, d8 +; GNU-NEXT: vld1.32 {d16[0]}, [r0:32] +; GNU-NEXT: vmovl.u16 q8, d16 +; GNU-NEXT: vmov.32 r0, d18[0] +; GNU-NEXT: vmov.32 r1, d18[1] +; GNU-NEXT: vmov.32 r2, d16[0] +; GNU-NEXT: vmov.32 r3, d16[1] +; GNU-NEXT: add sp, #24 +; GNU-NEXT: vpop {d8} +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_v2f16: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .vsave {d8} +; GNUEABI-NEXT: vpush {d8} +; GNUEABI-NEXT: .pad #24 +; GNUEABI-NEXT: sub sp, sp, #24 +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #12 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: mov r0, r4 +; GNUEABI-NEXT: bl __gnu_h2f_ieee +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #12] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: ldr r1, [sp, #4] +; GNUEABI-NEXT: strh r0, [sp, #22] +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: strh r0, [sp, #20] +; GNUEABI-NEXT: add r0, sp, #20 +; GNUEABI-NEXT: vld1.32 {d8[0]}, [r0:32] +; GNUEABI-NEXT: ldr r0, [sp, #8] +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: ldr r1, [sp] +; GNUEABI-NEXT: strh r0, [sp, #18] +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: bl __gnu_f2h_ieee +; GNUEABI-NEXT: strh r0, [sp, #16] +; GNUEABI-NEXT: add r0, sp, #16 +; GNUEABI-NEXT: vmovl.u16 q9, d8 +; GNUEABI-NEXT: vld1.32 {d16[0]}, [r0:32] +; GNUEABI-NEXT: vmovl.u16 q8, d16 +; GNUEABI-NEXT: vmov.32 r0, d18[0] +; GNUEABI-NEXT: vmov.32 r1, d18[1] +; GNUEABI-NEXT: vmov.32 r2, d16[0] +; GNUEABI-NEXT: vmov.32 r3, d16[1] +; GNUEABI-NEXT: add sp, sp, #24 +; GNUEABI-NEXT: vpop {d8} +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f16: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: vpush {d8} +; IOS-NO-STRET-NEXT: sub sp, sp, #8 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r1 +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #6] +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: bl ___extendhfsf2 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: bl 
___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #4] +; IOS-NO-STRET-NEXT: add r0, sp, #4 +; IOS-NO-STRET-NEXT: vld1.32 {d8[0]}, [r0:32] +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp, #2] +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: bl ___truncsfhf2 +; IOS-NO-STRET-NEXT: strh r0, [sp] +; IOS-NO-STRET-NEXT: mov r0, sp +; IOS-NO-STRET-NEXT: vld1.32 {d16[0]}, [r0:32] +; IOS-NO-STRET-NEXT: vmovl.u16 q9, d8 +; IOS-NO-STRET-NEXT: vmovl.u16 q8, d16 +; IOS-NO-STRET-NEXT: vmov.32 r0, d18[0] +; IOS-NO-STRET-NEXT: vmov.32 r1, d18[1] +; IOS-NO-STRET-NEXT: vmov.32 r2, d16[0] +; IOS-NO-STRET-NEXT: vmov.32 r3, d16[1] +; IOS-NO-STRET-NEXT: add sp, sp, #8 +; IOS-NO-STRET-NEXT: vpop {d8} +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f16: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, lr} +; IOS-WITH-STRET-NEXT: vpush {d8} +; IOS-WITH-STRET-NEXT: sub sp, sp, #24 +; IOS-WITH-STRET-NEXT: mov r4, r0 +; IOS-WITH-STRET-NEXT: mov r0, r1 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: add r0, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: ldr r0, [sp, #8] +; IOS-WITH-STRET-NEXT: ldr r4, [sp, #12] +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: ldm sp, {r1, r5} +; IOS-WITH-STRET-NEXT: strh r0, [sp, #22] +; IOS-WITH-STRET-NEXT: mov r0, r1 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #20] +; IOS-WITH-STRET-NEXT: add r0, sp, #20 +; IOS-WITH-STRET-NEXT: vld1.32 {d8[0]}, [r0:32] +; IOS-WITH-STRET-NEXT: mov r0, r4 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #18] +; IOS-WITH-STRET-NEXT: mov r0, r5 +; IOS-WITH-STRET-NEXT: bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT: strh r0, [sp, #16] +; IOS-WITH-STRET-NEXT: add r0, sp, #16 +; IOS-WITH-STRET-NEXT: vmovl.u16 q9, d8 +; IOS-WITH-STRET-NEXT: vld1.32 {d16[0]}, [r0:32] +; IOS-WITH-STRET-NEXT: vmovl.u16 q8, d16 +; IOS-WITH-STRET-NEXT: vmov.32 r0, d18[0] +; IOS-WITH-STRET-NEXT: vmov.32 r1, d18[1] +; IOS-WITH-STRET-NEXT: vmov.32 r2, d16[0] +; IOS-WITH-STRET-NEXT: vmov.32 r3, d16[1] +; IOS-WITH-STRET-NEXT: add sp, sp, #24 +; IOS-WITH-STRET-NEXT: vpop {d8} +; IOS-WITH-STRET-NEXT: pop {r4, r5, pc} +; +; WATCHABI-LABEL: test_sincos_v2f16: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d10} +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: vpush {d8} +; WATCHABI-NEXT: .cfi_def_cfa_offset 24 +; WATCHABI-NEXT: .cfi_offset d10, -16 +; WATCHABI-NEXT: .cfi_offset d8, -24 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 32 +; WATCHABI-NEXT: vmov.f32 s16, s0 +; WATCHABI-NEXT: vcvtb.f32.f16 s0, s1 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: vcvtb.f32.f16 s4, s16 +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vmov.f32 s0, s4 +; WATCHABI-NEXT: vmov.f32 s20, s1 +; WATCHABI-NEXT: strh.w r0, [sp, #6] +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT: vmov r0, s0 +; 
WATCHABI-NEXT: vcvtb.f16.f32 s0, s20 +; WATCHABI-NEXT: strh.w r0, [sp, #4] +; WATCHABI-NEXT: add r0, sp, #4 +; WATCHABI-NEXT: vld1.32 {d16[0]}, [r0:32] +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vcvtb.f16.f32 s0, s1 +; WATCHABI-NEXT: strh.w r0, [sp, #2] +; WATCHABI-NEXT: vmov r0, s0 +; WATCHABI-NEXT: vmovl.u16 q0, d16 +; WATCHABI-NEXT: strh.w r0, [sp] +; WATCHABI-NEXT: mov r0, sp +; WATCHABI-NEXT: vld1.32 {d18[0]}, [r0:32] +; WATCHABI-NEXT: vmovl.u16 q1, d18 +; WATCHABI-NEXT: vmov.f32 s2, s4 +; WATCHABI-NEXT: vmov.f32 s3, s5 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: vpop {d8} +; WATCHABI-NEXT: vpop {d10} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x half>, <2 x half> } @llvm.sincos.v2f16(<2 x half> %a) ret { <2 x half>, <2 x half> } %result } define { float, float } @test_sincos_f32(float %a) { -; CHECK-LABEL: test_sincos_f32: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: ldrd r1, r0, [sp], #8 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f32: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #8 +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: bl sincosf +; GNU-NEXT: ldrd r1, r0, [sp], #8 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f32: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #8 +; GNUEABI-NEXT: sub sp, sp, #8 +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: ldr r0, [sp, #4] +; GNUEABI-NEXT: ldr r1, [sp], #8 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f32: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, lr} +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: mov r1, r0 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f32: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #8 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: pop {r0, r1} +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f32: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { float, float } @llvm.sincos.f32(float %a) ret { float, float } %result } define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) { -; CHECK-LABEL: test_sincos_v2f32: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: vpush {d8} -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: vmov d8, r0, r1 -; CHECK-NEXT: add r1, sp, #4 -; CHECK-NEXT: mov r2, sp -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: add r1, sp, #12 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: bl sincosf -; CHECK-NEXT: vldr s1, [sp, #4] -; CHECK-NEXT: vldr s3, [sp] -; CHECK-NEXT: vldr s0, [sp, #12] -; CHECK-NEXT: 
vldr s2, [sp, #8] -; CHECK-NEXT: vmov r0, r1, d0 -; CHECK-NEXT: vmov r2, r3, d1 -; CHECK-NEXT: add sp, #16 -; CHECK-NEXT: vpop {d8} -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_v2f32: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: vpush {d8} +; GNU-NEXT: sub sp, #16 +; GNU-NEXT: vmov d8, r0, r1 +; GNU-NEXT: add r1, sp, #4 +; GNU-NEXT: mov r2, sp +; GNU-NEXT: vmov r0, s17 +; GNU-NEXT: bl sincosf +; GNU-NEXT: vmov r0, s16 +; GNU-NEXT: add r1, sp, #12 +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: bl sincosf +; GNU-NEXT: vldr s1, [sp, #4] +; GNU-NEXT: vldr s3, [sp] +; GNU-NEXT: vldr s0, [sp, #12] +; GNU-NEXT: vldr s2, [sp, #8] +; GNU-NEXT: vmov r0, r1, d0 +; GNU-NEXT: vmov r2, r3, d1 +; GNU-NEXT: add sp, #16 +; GNU-NEXT: vpop {d8} +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_v2f32: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .vsave {d8} +; GNUEABI-NEXT: vpush {d8} +; GNUEABI-NEXT: .pad #16 +; GNUEABI-NEXT: sub sp, sp, #16 +; GNUEABI-NEXT: vmov d8, r0, r1 +; GNUEABI-NEXT: add r1, sp, #4 +; GNUEABI-NEXT: mov r2, sp +; GNUEABI-NEXT: vmov r0, s17 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: vmov r0, s16 +; GNUEABI-NEXT: add r1, sp, #12 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: bl sincosf +; GNUEABI-NEXT: vldr s1, [sp, #4] +; GNUEABI-NEXT: vldr s3, [sp] +; GNUEABI-NEXT: vldr s0, [sp, #12] +; GNUEABI-NEXT: vldr s2, [sp, #8] +; GNUEABI-NEXT: vmov r0, r1, d0 +; GNUEABI-NEXT: vmov r2, r3, d1 +; GNUEABI-NEXT: add sp, sp, #16 +; GNUEABI-NEXT: vpop {d8} +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f32: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, lr} +; IOS-NO-STRET-NEXT: vpush {d8} +; IOS-NO-STRET-NEXT: vmov d8, r0, r1 +; IOS-NO-STRET-NEXT: vmov r4, s17 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: mov r0, r4 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: vmov r6, s16 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: bl _sinf +; IOS-NO-STRET-NEXT: mov r7, r0 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: bl _cosf +; IOS-NO-STRET-NEXT: mov r2, r0 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r5 +; IOS-NO-STRET-NEXT: mov r3, r4 +; IOS-NO-STRET-NEXT: vpop {d8} +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f32: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: vpush {d8} +; IOS-WITH-STRET-NEXT: sub sp, sp, #16 +; IOS-WITH-STRET-NEXT: vmov d8, r0, r1 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: vmov r1, s17 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: vmov r1, s16 +; IOS-WITH-STRET-NEXT: add r0, sp, #8 +; IOS-WITH-STRET-NEXT: bl ___sincosf_stret +; IOS-WITH-STRET-NEXT: vldr s1, [sp] +; IOS-WITH-STRET-NEXT: vldr s3, [sp, #4] +; IOS-WITH-STRET-NEXT: vldr s0, [sp, #8] +; IOS-WITH-STRET-NEXT: vldr s2, [sp, #12] +; IOS-WITH-STRET-NEXT: vmov r0, r1, d0 +; IOS-WITH-STRET-NEXT: vmov r2, r3, d1 +; IOS-WITH-STRET-NEXT: add sp, sp, #16 +; IOS-WITH-STRET-NEXT: vpop {d8} +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_v2f32: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d8, d9, d10} +; WATCHABI-NEXT: 
.cfi_def_cfa_offset 32 +; WATCHABI-NEXT: .cfi_offset d10, -16 +; WATCHABI-NEXT: .cfi_offset d9, -24 +; WATCHABI-NEXT: .cfi_offset d8, -32 +; WATCHABI-NEXT: vmov.f64 d8, d0 +; WATCHABI-NEXT: vmov.f32 s0, s17 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vmov.f32 s19, s0 +; WATCHABI-NEXT: vmov.f32 s0, s16 +; WATCHABI-NEXT: vmov.f32 s21, s1 +; WATCHABI-NEXT: bl ___sincosf_stret +; WATCHABI-NEXT: vmov.f32 s20, s1 +; WATCHABI-NEXT: vmov.f32 s18, s0 +; WATCHABI-NEXT: vmov.f64 d1, d10 +; WATCHABI-NEXT: vmov.f64 d0, d9 +; WATCHABI-NEXT: vpop {d8, d9, d10} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> %a) ret { <2 x float>, <2 x float> } %result } define { double, double } @test_sincos_f64(double %a) { -; CHECK-LABEL: test_sincos_f64: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: sub sp, #16 -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: bl sincos -; CHECK-NEXT: ldrd r0, r1, [sp, #8] -; CHECK-NEXT: ldrd r2, r3, [sp], #16 -; CHECK-NEXT: pop {r7, pc} +; GNU-LABEL: test_sincos_f64: +; GNU: @ %bb.0: +; GNU-NEXT: push {r7, lr} +; GNU-NEXT: sub sp, #16 +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: mov r3, sp +; GNU-NEXT: bl sincos +; GNU-NEXT: ldrd r0, r1, [sp, #8] +; GNU-NEXT: ldrd r2, r3, [sp], #16 +; GNU-NEXT: pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f64: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r11, lr} +; GNUEABI-NEXT: push {r11, lr} +; GNUEABI-NEXT: .pad #16 +; GNUEABI-NEXT: sub sp, sp, #16 +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: mov r3, sp +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: ldm sp, {r2, r3} +; GNUEABI-NEXT: ldr r0, [sp, #8] +; GNUEABI-NEXT: ldr r1, [sp, #12] +; GNUEABI-NEXT: add sp, sp, #16 +; GNUEABI-NEXT: pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f64: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, lr} +; IOS-NO-STRET-NEXT: mov r4, r1 +; IOS-NO-STRET-NEXT: mov r5, r0 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: mov r6, r0 +; IOS-NO-STRET-NEXT: mov r7, r1 +; IOS-NO-STRET-NEXT: mov r0, r5 +; IOS-NO-STRET-NEXT: mov r1, r4 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: mov r2, r0 +; IOS-NO-STRET-NEXT: mov r3, r1 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: mov r1, r7 +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f64: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #16 +; IOS-WITH-STRET-NEXT: mov r2, r1 +; IOS-WITH-STRET-NEXT: mov r1, r0 +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: vldr d16, [sp, #8] +; IOS-WITH-STRET-NEXT: ldm sp, {r0, r1} +; IOS-WITH-STRET-NEXT: vmov r2, r3, d16 +; IOS-WITH-STRET-NEXT: add sp, sp, #16 +; IOS-WITH-STRET-NEXT: pop {lr} +; IOS-WITH-STRET-NEXT: bx lr +; +; WATCHABI-LABEL: test_sincos_f64: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 16 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { double, double } @llvm.sincos.f64(double %a) ret { double, double } %result } define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) { -; CHECK-LABEL: test_sincos_v2f64: -; CHECK: @ %bb.0: -; 
CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: mov r1, r3 -; CHECK-NEXT: mov r12, r2 -; CHECK-NEXT: add r2, sp, #24 -; CHECK-NEXT: add r3, sp, #16 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: mov r0, r12 -; CHECK-NEXT: bl sincos -; CHECK-NEXT: ldrd r0, r1, [sp, #40] -; CHECK-NEXT: add r2, sp, #8 -; CHECK-NEXT: mov r3, sp -; CHECK-NEXT: bl sincos -; CHECK-NEXT: vldr d19, [sp, #8] -; CHECK-NEXT: vldr d18, [sp, #24] -; CHECK-NEXT: vldr d17, [sp] -; CHECK-NEXT: vldr d16, [sp, #16] -; CHECK-NEXT: vst1.64 {d18, d19}, [r4]! -; CHECK-NEXT: vst1.64 {d16, d17}, [r4] -; CHECK-NEXT: add sp, #32 -; CHECK-NEXT: pop {r4, pc} +; GNU-LABEL: test_sincos_v2f64: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, lr} +; GNU-NEXT: sub sp, #32 +; GNU-NEXT: mov r1, r3 +; GNU-NEXT: mov r12, r2 +; GNU-NEXT: add r2, sp, #24 +; GNU-NEXT: add r3, sp, #16 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: mov r0, r12 +; GNU-NEXT: bl sincos +; GNU-NEXT: ldrd r0, r1, [sp, #40] +; GNU-NEXT: add r2, sp, #8 +; GNU-NEXT: mov r3, sp +; GNU-NEXT: bl sincos +; GNU-NEXT: vldr d19, [sp, #8] +; GNU-NEXT: vldr d18, [sp, #24] +; GNU-NEXT: vldr d17, [sp] +; GNU-NEXT: vldr d16, [sp, #16] +; GNU-NEXT: vst1.64 {d18, d19}, [r4]! +; GNU-NEXT: vst1.64 {d16, d17}, [r4] +; GNU-NEXT: add sp, #32 +; GNU-NEXT: pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_v2f64: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, lr} +; GNUEABI-NEXT: push {r4, lr} +; GNUEABI-NEXT: .pad #32 +; GNUEABI-NEXT: sub sp, sp, #32 +; GNUEABI-NEXT: mov r1, r3 +; GNUEABI-NEXT: mov r12, r2 +; GNUEABI-NEXT: add r2, sp, #24 +; GNUEABI-NEXT: add r3, sp, #16 +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: mov r0, r12 +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: ldr r0, [sp, #40] +; GNUEABI-NEXT: add r2, sp, #8 +; GNUEABI-NEXT: ldr r1, [sp, #44] +; GNUEABI-NEXT: mov r3, sp +; GNUEABI-NEXT: bl sincos +; GNUEABI-NEXT: vldr d19, [sp, #8] +; GNUEABI-NEXT: vldr d18, [sp, #24] +; GNUEABI-NEXT: vldr d17, [sp] +; GNUEABI-NEXT: vldr d16, [sp, #16] +; GNUEABI-NEXT: vst1.64 {d18, d19}, [r4]! +; GNUEABI-NEXT: vst1.64 {d16, d17}, [r4] +; GNUEABI-NEXT: add sp, sp, #32 +; GNUEABI-NEXT: pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f64: +; IOS-NO-STRET: @ %bb.0: +; IOS-NO-STRET-NEXT: push {r4, r5, r6, r7, r8, r10, r11, lr} +; IOS-NO-STRET-NEXT: vpush {d8, d9, d10, d11} +; IOS-NO-STRET-NEXT: ldr r8, [sp, #64] +; IOS-NO-STRET-NEXT: mov r7, r1 +; IOS-NO-STRET-NEXT: mov r4, r0 +; IOS-NO-STRET-NEXT: mov r0, r3 +; IOS-NO-STRET-NEXT: mov r6, r3 +; IOS-NO-STRET-NEXT: mov r10, r2 +; IOS-NO-STRET-NEXT: mov r1, r8 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: mov r11, r0 +; IOS-NO-STRET-NEXT: mov r5, r1 +; IOS-NO-STRET-NEXT: mov r0, r6 +; IOS-NO-STRET-NEXT: mov r1, r8 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: vmov d9, r0, r1 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r10 +; IOS-NO-STRET-NEXT: vmov d11, r11, r5 +; IOS-NO-STRET-NEXT: bl _sin +; IOS-NO-STRET-NEXT: vmov d10, r0, r1 +; IOS-NO-STRET-NEXT: mov r0, r7 +; IOS-NO-STRET-NEXT: mov r1, r10 +; IOS-NO-STRET-NEXT: bl _cos +; IOS-NO-STRET-NEXT: vmov d8, r0, r1 +; IOS-NO-STRET-NEXT: vst1.32 {d10, d11}, [r4]! 
+; IOS-NO-STRET-NEXT: vst1.32 {d8, d9}, [r4] +; IOS-NO-STRET-NEXT: vpop {d8, d9, d10, d11} +; IOS-NO-STRET-NEXT: pop {r4, r5, r6, r7, r8, r10, r11, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f64: +; IOS-WITH-STRET: @ %bb.0: +; IOS-WITH-STRET-NEXT: push {r4, r5, r6, lr} +; IOS-WITH-STRET-NEXT: sub sp, sp, #32 +; IOS-WITH-STRET-NEXT: mov r4, r2 +; IOS-WITH-STRET-NEXT: ldr r2, [sp, #48] +; IOS-WITH-STRET-NEXT: mov r6, r0 +; IOS-WITH-STRET-NEXT: add r0, sp, #16 +; IOS-WITH-STRET-NEXT: mov r5, r1 +; IOS-WITH-STRET-NEXT: mov r1, r3 +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: mov r0, sp +; IOS-WITH-STRET-NEXT: mov r1, r5 +; IOS-WITH-STRET-NEXT: mov r2, r4 +; IOS-WITH-STRET-NEXT: bl ___sincos_stret +; IOS-WITH-STRET-NEXT: vldr d17, [sp, #16] +; IOS-WITH-STRET-NEXT: vldr d16, [sp] +; IOS-WITH-STRET-NEXT: vldr d19, [sp, #24] +; IOS-WITH-STRET-NEXT: vldr d18, [sp, #8] +; IOS-WITH-STRET-NEXT: vst1.32 {d16, d17}, [r6]! +; IOS-WITH-STRET-NEXT: vst1.32 {d18, d19}, [r6] +; IOS-WITH-STRET-NEXT: add sp, sp, #32 +; IOS-WITH-STRET-NEXT: pop {r4, r5, r6, pc} +; +; WATCHABI-LABEL: test_sincos_v2f64: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push {r7, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 8 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; WATCHABI-NEXT: .cfi_def_cfa_offset 56 +; WATCHABI-NEXT: .cfi_offset d13, -16 +; WATCHABI-NEXT: .cfi_offset d12, -24 +; WATCHABI-NEXT: .cfi_offset d11, -32 +; WATCHABI-NEXT: .cfi_offset d10, -40 +; WATCHABI-NEXT: .cfi_offset d9, -48 +; WATCHABI-NEXT: .cfi_offset d8, -56 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 64 +; WATCHABI-NEXT: vorr q4, q0, q0 +; WATCHABI-NEXT: vorr d0, d9, d9 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: vorr d11, d0, d0 +; WATCHABI-NEXT: vorr d0, d8, d8 +; WATCHABI-NEXT: vorr d13, d1, d1 +; WATCHABI-NEXT: bl ___sincos_stret +; WATCHABI-NEXT: vorr d12, d1, d1 +; WATCHABI-NEXT: vorr d10, d0, d0 +; WATCHABI-NEXT: vorr q1, q6, q6 +; WATCHABI-NEXT: vorr q0, q5, q5 +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; WATCHABI-NEXT: pop {r7, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a) ret { <2 x double>, <2 x double> } %result } define { fp128, fp128 } @test_sincos_f128(fp128 %a) { -; CHECK-LABEL: test_sincos_f128: -; CHECK: @ %bb.0: -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: sub sp, #40 -; CHECK-NEXT: mov r12, r3 -; CHECK-NEXT: ldr r3, [sp, #56] -; CHECK-NEXT: add.w lr, sp, #8 -; CHECK-NEXT: mov r4, r0 -; CHECK-NEXT: add r0, sp, #24 -; CHECK-NEXT: strd r0, lr, [sp] -; CHECK-NEXT: mov r0, r1 -; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: mov r2, r12 -; CHECK-NEXT: bl sincosl -; CHECK-NEXT: ldrd r2, r3, [sp, #16] -; CHECK-NEXT: ldrd r12, r1, [sp, #8] -; CHECK-NEXT: str r3, [r4, #28] -; CHECK-NEXT: ldrd r3, r5, [sp, #32] -; CHECK-NEXT: ldrd lr, r0, [sp, #24] -; CHECK-NEXT: strd r1, r2, [r4, #20] -; CHECK-NEXT: add.w r1, r4, #8 -; CHECK-NEXT: stm.w r1, {r3, r5, r12} -; CHECK-NEXT: strd lr, r0, [r4] -; CHECK-NEXT: add sp, #40 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; GNU-LABEL: test_sincos_f128: +; GNU: @ %bb.0: +; GNU-NEXT: push {r4, r5, r7, lr} +; GNU-NEXT: sub sp, #40 +; GNU-NEXT: mov r12, r3 +; GNU-NEXT: ldr r3, [sp, #56] +; GNU-NEXT: add.w lr, sp, #8 +; GNU-NEXT: mov r4, r0 +; GNU-NEXT: add r0, sp, #24 +; GNU-NEXT: strd r0, lr, [sp] +; GNU-NEXT: mov r0, r1 +; GNU-NEXT: mov r1, r2 +; 
GNU-NEXT: mov r2, r12 +; GNU-NEXT: bl sincosl +; GNU-NEXT: ldrd r2, r3, [sp, #16] +; GNU-NEXT: ldrd r12, r1, [sp, #8] +; GNU-NEXT: str r3, [r4, #28] +; GNU-NEXT: ldrd r3, r5, [sp, #32] +; GNU-NEXT: ldrd lr, r0, [sp, #24] +; GNU-NEXT: strd r1, r2, [r4, #20] +; GNU-NEXT: add.w r1, r4, #8 +; GNU-NEXT: stm.w r1, {r3, r5, r12} +; GNU-NEXT: strd lr, r0, [r4] +; GNU-NEXT: add sp, #40 +; GNU-NEXT: pop {r4, r5, r7, pc} +; +; GNUEABI-LABEL: test_sincos_f128: +; GNUEABI: @ %bb.0: +; GNUEABI-NEXT: .save {r4, r5, r11, lr} +; GNUEABI-NEXT: push {r4, r5, r11, lr} +; GNUEABI-NEXT: .pad #40 +; GNUEABI-NEXT: sub sp, sp, #40 +; GNUEABI-NEXT: mov r12, r3 +; GNUEABI-NEXT: ldr r3, [sp, #56] +; GNUEABI-NEXT: mov r4, r0 +; GNUEABI-NEXT: add r0, sp, #24 +; GNUEABI-NEXT: add r5, sp, #8 +; GNUEABI-NEXT: stm sp, {r0, r5} +; GNUEABI-NEXT: mov r0, r1 +; GNUEABI-NEXT: mov r1, r2 +; GNUEABI-NEXT: mov r2, r12 +; GNUEABI-NEXT: bl sincosl +; GNUEABI-NEXT: add r3, sp, #12 +; GNUEABI-NEXT: ldr r12, [sp, #8] +; GNUEABI-NEXT: ldm r3, {r1, r2, r3} +; GNUEABI-NEXT: str r3, [r4, #28] +; GNUEABI-NEXT: ldr r0, [sp, #32] +; GNUEABI-NEXT: ldr lr, [sp, #24] +; GNUEABI-NEXT: ldr r5, [sp, #28] +; GNUEABI-NEXT: ldr r3, [sp, #36] +; GNUEABI-NEXT: str r2, [r4, #24] +; GNUEABI-NEXT: str r1, [r4, #20] +; GNUEABI-NEXT: add r1, r4, #8 +; GNUEABI-NEXT: stm r1, {r0, r3, r12} +; GNUEABI-NEXT: str r5, [r4, #4] +; GNUEABI-NEXT: str lr, [r4] +; GNUEABI-NEXT: add sp, sp, #40 +; GNUEABI-NEXT: pop {r4, r5, r11, pc} +; +; IOS-LABEL: test_sincos_f128: +; IOS: @ %bb.0: +; IOS-NEXT: push {r4, r5, r6, r7, r8, lr} +; IOS-NEXT: ldr r8, [sp, #24] +; IOS-NEXT: mov r4, r0 +; IOS-NEXT: mov r5, r3 +; IOS-NEXT: mov r6, r2 +; IOS-NEXT: mov r7, r1 +; IOS-NEXT: mov r0, r1 +; IOS-NEXT: mov r1, r2 +; IOS-NEXT: mov r2, r3 +; IOS-NEXT: mov r3, r8 +; IOS-NEXT: bl _cosl +; IOS-NEXT: add r9, r4, #16 +; IOS-NEXT: stm r9, {r0, r1, r2, r3} +; IOS-NEXT: mov r0, r7 +; IOS-NEXT: mov r1, r6 +; IOS-NEXT: mov r2, r5 +; IOS-NEXT: mov r3, r8 +; IOS-NEXT: bl _sinl +; IOS-NEXT: stm r4, {r0, r1, r2, r3} +; IOS-NEXT: pop {r4, r5, r6, r7, r8, pc} +; +; WATCHABI-LABEL: test_sincos_f128: +; WATCHABI: .cfi_startproc +; WATCHABI-NEXT: @ %bb.0: +; WATCHABI-NEXT: push.w {r4, r5, r6, r7, r8, lr} +; WATCHABI-NEXT: .cfi_def_cfa_offset 24 +; WATCHABI-NEXT: .cfi_offset lr, -4 +; WATCHABI-NEXT: .cfi_offset r7, -8 +; WATCHABI-NEXT: .cfi_offset r6, -12 +; WATCHABI-NEXT: .cfi_offset r5, -16 +; WATCHABI-NEXT: .cfi_offset r4, -20 +; WATCHABI-NEXT: .cfi_offset r8, -24 +; WATCHABI-NEXT: sub sp, #8 +; WATCHABI-NEXT: .cfi_def_cfa_offset 32 +; WATCHABI-NEXT: ldr.w r8, [sp, #32] +; WATCHABI-NEXT: mov r4, r0 +; WATCHABI-NEXT: mov r5, r3 +; WATCHABI-NEXT: mov r6, r2 +; WATCHABI-NEXT: mov r7, r1 +; WATCHABI-NEXT: mov r0, r1 +; WATCHABI-NEXT: mov r1, r2 +; WATCHABI-NEXT: mov r2, r3 +; WATCHABI-NEXT: mov r3, r8 +; WATCHABI-NEXT: bl _cosl +; WATCHABI-NEXT: add.w r9, r4, #16 +; WATCHABI-NEXT: stm.w r9, {r0, r1, r2, r3} +; WATCHABI-NEXT: mov r0, r7 +; WATCHABI-NEXT: mov r1, r6 +; WATCHABI-NEXT: mov r2, r5 +; WATCHABI-NEXT: mov r3, r8 +; WATCHABI-NEXT: bl _sinl +; WATCHABI-NEXT: stm r4!, {r0, r1, r2, r3} +; WATCHABI-NEXT: add sp, #8 +; WATCHABI-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; WATCHABI-NEXT: .cfi_endproc %result = call { fp128, fp128 } @llvm.sincos.f16(fp128 %a) ret { fp128, fp128 } %result } diff --git a/llvm/test/CodeGen/BPF/bpf_trap.ll b/llvm/test/CodeGen/BPF/bpf_trap.ll new file mode 100644 index 0000000000000..ab8df5ff7cb0d --- /dev/null +++ b/llvm/test/CodeGen/BPF/bpf_trap.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s 
| FileCheck %s +; +target triple = "bpf" + +define i32 @test(i8 %x) { +entry: + %0 = and i8 %x, 3 + switch i8 %0, label %default.unreachable4 [ + i8 0, label %return + i8 1, label %sw.bb1 + i8 2, label %sw.bb2 + i8 3, label %sw.bb3 + ] + +sw.bb1: ; preds = %entry + br label %return + +sw.bb2: ; preds = %entry + br label %return + +sw.bb3: ; preds = %entry + br label %return + +default.unreachable4: ; preds = %entry + unreachable + +return: ; preds = %entry, %sw.bb3, %sw.bb2, %sw.bb1 + %retval.0 = phi i32 [ 12, %sw.bb1 ], [ 43, %sw.bb2 ], [ 54, %sw.bb3 ], [ 32, %entry ] + ret i32 %retval.0 +} + +; CHECK-NOT: __bpf_trap diff --git a/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll index d5a1d63b644a8..b7d518639d70e 100644 --- a/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll +++ b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll @@ -84,8 +84,8 @@ llc -march=bpf -mcpu=v4 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_3 +; CHECK: .quad LBB0_3-.text ; CHECK: .size BPF.JT.0.0, 8 ; CHECK: BPF.JT.0.1: -; CHECK: .quad LBB0_4 +; CHECK: .quad LBB0_4-.text ; CHECK: .size BPF.JT.0.1, 8 diff --git a/llvm/test/CodeGen/BPF/jump_table_global_var.ll b/llvm/test/CodeGen/BPF/jump_table_global_var.ll index bbca46850843b..71c682f5530ed 100644 --- a/llvm/test/CodeGen/BPF/jump_table_global_var.ll +++ b/llvm/test/CodeGen/BPF/jump_table_global_var.ll @@ -78,6 +78,6 @@ llc -march=bpf -mcpu=v4 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_1 -; CHECK: .quad LBB0_2 +; CHECK: .quad LBB0_1-.text +; CHECK: .quad LBB0_2-.text ; CHECK: .size BPF.JT.0.0, 16 diff --git a/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll index 682b025d665d6..eb1e5bff11013 100644 --- a/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll +++ b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll @@ -93,34 +93,34 @@ llc -march=bpf -mcpu=v4 -bpf-min-jump-table-entries=3 < test.ll \ ; CHECK: .cfi_endproc ; CHECK: .section .jumptables,"",@progbits ; CHECK: BPF.JT.0.0: -; CHECK: .quad LBB0_4 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_2 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_5 -; CHECK: .quad LBB0_3 +; CHECK: .quad LBB0_4-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_2-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; CHECK: .quad LBB0_5-.text +; 
CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_5-.text
+; CHECK: .quad LBB0_3-.text
 ; CHECK: .size BPF.JT.0.0, 240
diff --git a/llvm/test/CodeGen/DirectX/f16tof32.ll b/llvm/test/CodeGen/DirectX/f16tof32.ll
new file mode 100644
index 0000000000000..edc5c1942e8bd
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/f16tof32.ll
@@ -0,0 +1,57 @@
+; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.9-library %s | FileCheck %s
+
+define hidden noundef nofpclass(nan inf) float @_Z11test_scalarj(i32 noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 %p0)
+  ; CHECK: ret float [[UINT]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn float @llvm.dx.legacyf16tof32.i32(i32 %p0)
+  ret float %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <2 x float> @_Z10test_uint2Dv2_j(<2 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT2_0:%.*]] = extractelement <2 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT2_0]])
+  ; CHECK: [[UINT2_1:%.*]] = extractelement <2 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT2_1]])
+  ; CHECK: [[FLOAT2_0:%.*]] = insertelement <2 x float> poison, float [[FLOAT_0]], i64 0
+  ; CHECK: [[FLOAT2_1:%.*]] = insertelement <2 x float> [[FLOAT2_0]], float [[FLOAT_1]], i64 1
+  ; CHECK: ret <2 x float> [[FLOAT2_1]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.dx.legacyf16tof32.v2i32(<2 x i32> %p0)
+  ret <2 x float> %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <3 x float> @_Z10test_uint3Dv3_j(<3 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT3_0:%.*]] = extractelement <3 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_0]])
+  ; CHECK: [[UINT3_1:%.*]] = extractelement <3 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_1]])
+  ; CHECK: [[UINT3_2:%.*]] = extractelement <3 x i32> %p0, i64 2
+  ; CHECK: [[FLOAT_2:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT3_2]])
+  ; CHECK: [[FLOAT3_0:%.*]] = insertelement <3 x float> poison, float [[FLOAT_0]], i64 0
+  ; CHECK: [[FLOAT3_1:%.*]] = insertelement <3 x float> [[FLOAT3_0]], float [[FLOAT_1]], i64 1
+  ; CHECK: [[FLOAT3_2:%.*]] = insertelement <3 x float> [[FLOAT3_1]], float [[FLOAT_2]], i64 2
+  ; CHECK: ret <3 x float> [[FLOAT3_2]]
+  %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.dx.legacyf16tof32.v3i32(<3 x i32> %p0)
+  ret <3 x float> %hlsl.f16tof32
+}
+
+define hidden noundef nofpclass(nan inf) <4 x float> @_Z10test_uint4Dv4_j(<4 x i32> noundef %p0) local_unnamed_addr #0 {
+entry:
+  ; CHECK: [[UINT4_0:%.*]] = extractelement <4 x i32> %p0, i64 0
+  ; CHECK: [[FLOAT_0:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_0]])
+  ; CHECK: [[UINT4_1:%.*]] = extractelement <4 x i32> %p0, i64 1
+  ; CHECK: [[FLOAT_1:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_1]])
+  ; CHECK: [[UINT4_2:%.*]] = extractelement <4 x i32> %p0, i64 2
+  ; CHECK: [[FLOAT_2:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_2]])
+  ; CHECK: [[UINT4_3:%.*]] = extractelement <4 x i32> %p0, i64 3
+  ; CHECK: [[FLOAT_3:%.*]] = call float @dx.op.legacyF16ToF32(i32 131, i32 [[UINT4_3]])
+  ; CHECK: 
[[FLOAT4_0:%.*]] = insertelement <4 x float> poison, float [[FLOAT_0]], i64 0 + ; CHECK: [[FLOAT4_1:%.*]] = insertelement <4 x float> [[FLOAT4_0]], float [[FLOAT_1]], i64 1 + ; CHECK: [[FLOAT4_2:%.*]] = insertelement <4 x float> [[FLOAT4_1]], float [[FLOAT_2]], i64 2 + ; CHECK: [[FLOAT4_3:%.*]] = insertelement <4 x float> [[FLOAT4_2]], float [[FLOAT_3]], i64 3 + ; CHECK : ret <4 x float> [[FLOAT4_3]] + %hlsl.f16tof32 = tail call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.dx.legacyf16tof32.v4i32(<4 x i32> %p0) + ret <4 x float> %hlsl.f16tof32 +} diff --git a/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll b/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll new file mode 100644 index 0000000000000..67d2ad72ee2f4 --- /dev/null +++ b/llvm/test/CodeGen/Generic/machine-function-splitter-optnone.ll @@ -0,0 +1,50 @@ +; REQUIRES: x86-registered-target + +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -split-machine-functions -O0 -mfs-psi-cutoff=0 -mfs-count-threshold=10000 | FileCheck %s + +;; Check that functions with optnone attribute are not split. +; CHECK-LABEL: foo_optnone: +; CHECK-NOT: .section .text.split.foo_optnone +; CHECK-NOT: foo_optnone.cold: +; CHECK: .LBB0_2: +; CHECK: .size foo_optnone + +define void @foo_optnone(i1 zeroext %0) nounwind optnone noinline !prof !14 !section_prefix !15 { +entry: + br i1 %0, label %hot, label %cold, !prof !17 + +hot: + %1 = call i32 @bar() + br label %exit + +cold: + %2 = call i32 @baz() + br label %exit + +exit: + %3 = tail call i32 @qux() + ret void +} + +declare i32 @bar() +declare i32 @baz() +declare i32 @qux() + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 5} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999900, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2} +!14 = !{!"function_entry_count", i64 7000} +!15 = !{!"function_section_prefix", !"hot"} +!17 = !{!"branch_weights", i32 7000, i32 0} diff --git a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll index 559bb68741e12..930cf8152b756 100644 --- a/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll +++ b/llvm/test/CodeGen/Hexagon/vect/zext-v4i1.ll @@ -6,11 +6,11 @@ target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048" target triple = "hexagon" -define i32 @fred(ptr %a0) #0 { +define i32 @fred(ptr %a0, i32 %cond) #0 { ; CHECK-LABEL: fred: ; CHECK: // %bb.0: // %b0 ; CHECK-NEXT: { -; CHECK-NEXT: if (p0) jump:nt .LBB0_2 +; CHECK-NEXT: p0 = cmp.eq(r1,#5); if (!p0.new) jump:t .LBB0_2 ; CHECK-NEXT: } ; CHECK-NEXT: // %bb.1: // %b2 ; CHECK-NEXT: { @@ -40,7 +40,7 @@ define i32 @fred(ptr %a0) #0 { ; CHECK-NEXT: jumpr r31 ; CHECK-NEXT: } b0: - switch i32 undef, label %b14 [ + switch i32 %cond, label %b14 [ i32 5, label %b2 i32 3, label %b1 ] diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll new file mode 100644 index 0000000000000..006713ccabf47 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll @@ -0,0 +1,303 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +declare <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float>) + +define void @lasx_cast_128_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x float>, ptr %va + %b = call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> %a) + store <8 x float> %b, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double>) + +define void @lasx_cast_128_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x double>, ptr %va + %b = call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> %a) + store <4 x double> %b, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64>) + +define void @lasx_cast_128(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_cast_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x i64>, ptr %va + %b = call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> %a) + store <4 x i64> %b, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float>, <4 x float>) + +define void @lasx_concat_128_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double>, <2 x double>) + +define void @lasx_concat_128_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64>, <2 x i64>) + +define void @lasx_concat_128(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_concat_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <2 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} + +declare <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float>) + +define void @lasx_extract_128_lo_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %c = call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> %a) + store <4 x 
float> %c, ptr %vd + ret void +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double>) + +define void @lasx_extract_128_lo_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64>) + +define void @lasx_extract_128_lo(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void +} + +declare <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float>) + +define void @lasx_extract_128_hi_s(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %c = call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> %a) + store <4 x float> %c, ptr %vd + ret void +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double>) + +define void @lasx_extract_128_hi_d(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64>) + +define void @lasx_extract_128_hi(ptr %vd, ptr %va) { +; CHECK-LABEL: lasx_extract_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float>, <4 x float>) + +define void @lasx_insert_128_lo_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double>, <2 x double>) + +define void @lasx_insert_128_lo_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + 
+declare <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64>, <2 x i64>) + +define void @lasx_insert_128_lo(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float>, <4 x float>) + +define void @lasx_insert_128_hi_s(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double>, <2 x double>) + +define void @lasx_insert_128_hi_d(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void +} + +declare <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64>, <2 x i64>) + +define void @lasx_insert_128_hi(ptr %vd, ptr %va, ptr %vb) { +; CHECK-LABEL: lasx_insert_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index 4d930cd9e57c0..3626613cf8511 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM70 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | FileCheck --check-prefixes=CHECK,SM80 %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM80-FTZ %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 -denormal-fp-math-f32=preserve-sign | FileCheck --check-prefixes=CHECK,SM90-FTZ %s ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck --check-prefixes=CHECK,SM90 %s ; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 | %ptxas-verify -arch=sm_80 %} ; RUN: %if ptxas-sm_80 && ptxas-isa-7.1 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx71 -denormal-fp-math-f32=preserve-sign | %ptxas-verify -arch=sm_80 %} @@ -55,13 +56,24 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) { ; SM80-FTZ-NEXT: // %bb.0: ; 
SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fadd( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fadd_param_1]; +; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fadd( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -118,13 +130,24 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; ; SM80-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs3, %r3; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fsub( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fsub_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_fsub_param_1]; +; SM90-FTZ-NEXT: sub.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fsub( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -195,16 +218,27 @@ define <2 x bfloat> @test_faddx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_faddx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_faddx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_faddx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_faddx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_faddx2_param_1]; +; SM90-FTZ-NEXT: add.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_faddx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -275,16 +309,27 @@ define <2 x bfloat> @test_fsubx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fsubx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fsubx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; 
SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: sub.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fsubx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fsubx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fsubx2_param_1]; +; SM90-FTZ-NEXT: sub.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fsubx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -355,16 +400,27 @@ define <2 x bfloat> @test_fmulx2(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fmulx2_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fmulx2_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: mul.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fmulx2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fmulx2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_fmulx2_param_1]; +; SM90-FTZ-NEXT: mul.rn.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fmulx2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -441,16 +497,34 @@ define <2 x bfloat> @test_fdiv(<2 x bfloat> %a, <2 x bfloat> %b) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0]; ; SM80-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r2, %rs1; ; SM80-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r4, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs2; ; SM80-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4; ; SM80-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r7; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fdiv( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<5>; +; SM90-FTZ-NEXT: .reg .b32 %r<8>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs1, %rs2}, [test_fdiv_param_0]; +; SM90-FTZ-NEXT: ld.param.v2.b16 {%rs3, %rs4}, [test_fdiv_param_1]; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs3; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r2, %rs1; +; SM90-FTZ-NEXT: div.rn.ftz.f32 %r3, %r2, %r1; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r4, %rs4; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs2; +; SM90-FTZ-NEXT: div.rn.ftz.f32 %r6, %r5, %r4; +; 
SM90-FTZ-NEXT: cvt.rn.bf16x2.f32 %r7, %r6, %r3; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r7; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fdiv( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<5>; @@ -527,10 +601,21 @@ define float @test_fpext_float(bfloat %a) #0 { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fpext_float( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fpext_float( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -585,6 +670,17 @@ define bfloat @test_fptrunc_float(float %a) #0 { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptrunc_float( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_fptrunc_float_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptrunc_float( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -637,12 +733,23 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: add.rn.ftz.f32 %r2, %r1, 0f3F800000; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fadd_imm_1( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fadd_imm_1_param_0]; +; SM90-FTZ-NEXT: mov.b16 %rs2, 0x3F80; +; SM90-FTZ-NEXT: add.rn.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fadd_imm_1( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -750,18 +857,43 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; ; SM80-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; ; SM80-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r5, %rs8; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r6, %rs7; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r7, %rs6; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r8, %rs5; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r9, %rs4; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r10, %rs3; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r11, %rs2; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r12, %rs1; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; ; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: 
test_extload_bf16x8( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<9>; +; SM90-FTZ-NEXT: .reg .b32 %r<13>; +; SM90-FTZ-NEXT: .reg .b64 %rd<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; +; SM90-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r3; +; SM90-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r4; +; SM90-FTZ-NEXT: mov.b32 {%rs5, %rs6}, %r1; +; SM90-FTZ-NEXT: mov.b32 {%rs7, %rs8}, %r2; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r5, %rs8; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r6, %rs7; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r7, %rs6; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r8, %rs5; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r9, %rs4; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r10, %rs3; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r11, %rs2; +; SM90-FTZ-NEXT: cvt.ftz.f32.bf16 %r12, %rs1; +; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%r12, %r11, %r10, %r9}; +; SM90-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5}; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_extload_bf16x8( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<9>; @@ -825,12 +957,24 @@ define i16 @test_fptosi_i16(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rzi.ftz.s16.f32 %rs2, %r1; ; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptosi_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptosi_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rzi.s16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptosi_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -880,12 +1024,24 @@ define i16 @test_fptoui_i16(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rzi.ftz.u16.f32 %rs2, %r1; ; SM80-FTZ-NEXT: cvt.u32.u16 %r2, %rs2; ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_fptoui_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_fptoui_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rzi.u16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: cvt.u32.u16 %r1, %rs2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_fptoui_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -945,6 +1101,16 @@ define bfloat @test_sitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_sitofp_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.s16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_sitofp_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1002,6 +1168,16 @@ define bfloat @test_uitofp_i8(i8 %a) { ; SM80-FTZ-NEXT: st.param.b16 
[func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i8( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i8( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1070,6 +1246,21 @@ define bfloat @test_uitofp_i1(i1 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i1( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .pred %p<2>; +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0]; +; SM90-FTZ-NEXT: and.b16 %rs2, %rs1, 1; +; SM90-FTZ-NEXT: setp.ne.b16 %p1, %rs2, 0; +; SM90-FTZ-NEXT: selp.b32 %r1, 1, 0, %p1; +; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs3, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i1( ; SM90: { ; SM90-NEXT: .reg .pred %p<2>; @@ -1132,6 +1323,16 @@ define bfloat @test_uitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i16( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i16( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1188,6 +1389,17 @@ define bfloat @test_uitofp_i32(i32 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i32( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b32 %r<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u32 %rs1, %r1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i32( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1248,6 +1460,17 @@ define bfloat @test_uitofp_i64(i64 %a) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_uitofp_i64( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<2>; +; SM90-FTZ-NEXT: .reg .b64 %rd<2>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0]; +; SM90-FTZ-NEXT: cvt.rn.bf16.u64 %rs1, %rd1; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_uitofp_i64( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<2>; @@ -1302,12 +1525,22 @@ define bfloat @test_roundeven(bfloat %a) { ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: ; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0]; -; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %r1, %rs1; +; SM80-FTZ-NEXT: cvt.f32.bf16 %r1, %rs1; ; SM80-FTZ-NEXT: cvt.rni.ftz.f32.f32 %r2, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %r2; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_roundeven( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<3>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_roundeven_param_0]; +; SM90-FTZ-NEXT: cvt.rni.bf16.bf16 %rs2, %rs1; +; SM90-FTZ-NEXT: st.param.b16 
[func_retval0], %rs2; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_roundeven( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<3>; @@ -1372,6 +1605,17 @@ define bfloat @test_maximum(bfloat %a, bfloat %b) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maximum( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maximum_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maximum_param_1]; +; SM90-FTZ-NEXT: max.NaN.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maximum( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -1430,6 +1674,17 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) { ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maxnum( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b16 %rs<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b16 %rs1, [test_maxnum_param_0]; +; SM90-FTZ-NEXT: ld.param.b16 %rs2, [test_maxnum_param_1]; +; SM90-FTZ-NEXT: max.bf16 %rs3, %rs1, %rs2; +; SM90-FTZ-NEXT: st.param.b16 [func_retval0], %rs3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maxnum( ; SM90: { ; SM90-NEXT: .reg .b16 %rs<4>; @@ -1511,6 +1766,17 @@ define <2 x bfloat> @test_maximum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maximum_v2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maximum_v2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maximum_v2_param_1]; +; SM90-FTZ-NEXT: max.NaN.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maximum_v2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; @@ -1583,6 +1849,17 @@ define <2 x bfloat> @test_maxnum_v2(<2 x bfloat> %a, <2 x bfloat> %b) { ; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %r3; ; SM80-FTZ-NEXT: ret; ; +; SM90-FTZ-LABEL: test_maxnum_v2( +; SM90-FTZ: { +; SM90-FTZ-NEXT: .reg .b32 %r<4>; +; SM90-FTZ-EMPTY: +; SM90-FTZ-NEXT: // %bb.0: +; SM90-FTZ-NEXT: ld.param.b32 %r1, [test_maxnum_v2_param_0]; +; SM90-FTZ-NEXT: ld.param.b32 %r2, [test_maxnum_v2_param_1]; +; SM90-FTZ-NEXT: max.bf16x2 %r3, %r1, %r2; +; SM90-FTZ-NEXT: st.param.b32 [func_retval0], %r3; +; SM90-FTZ-NEXT: ret; +; ; SM90-LABEL: test_maxnum_v2( ; SM90: { ; SM90-NEXT: .reg .b32 %r<4>; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll new file mode 100644 index 0000000000000..4d81fdc67736d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll @@ -0,0 +1,11 @@ +; RUN: not llc -mcpu=sm_100a -mtriple=nvptx64 -mattr=+ptx86 %s -o /dev/null 2>&1 | FileCheck %s + +; Test that we get a clear error message when using an unsupported syncscope. 
+ +; CHECK: NVPTX backend does not support syncscope "agent" +; CHECK: Supported syncscopes are: singlethread, <empty string>, block, cluster, device +define i32 @cmpxchg_unsupported_syncscope_agent(ptr %addr, i32 %cmp, i32 %new) { + %result = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("agent") monotonic monotonic + %value = extractvalue { i32, i1 } %result, 0 + ret i32 %value +} diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll b/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll index 7e2f744ac1d71..94121f09e36be 100644 --- a/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll +++ b/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC define void @testMultiply(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, ptr nocapture noundef writeonly %c) local_unnamed_addr #0 { ; CHECK-LABEL: testMultiply: @@ -91,6 +97,91 @@ define void @testMultiply(ptr nocapture noundef readonly %a, ptr nocapture nound ; CHECK-BE-NEXT: ld r30, -16(r1) ; CHECK-BE-NEXT: mtlr r0 ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: testMultiply: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: mflr r0 +; CHECK-LE-WACC-NEXT: std r30, -16(r1) +; CHECK-LE-WACC-NEXT: std r0, 16(r1) +; CHECK-LE-WACC-NEXT: clrldi r0, r1, 59 +; CHECK-LE-WACC-NEXT: subfic r0, r0, -128 +; CHECK-LE-WACC-NEXT: mr r30, r1 +; CHECK-LE-WACC-NEXT: stdux r1, r1, r0 +; CHECK-LE-WACC-NEXT: stxv v30, -64(r30) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v31, -48(r30) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: lxv v31, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v30, 0(r4) +; CHECK-LE-WACC-NEXT: addi r3, r1, 32 +; CHECK-LE-WACC-NEXT: std r29, -24(r30) # 8-byte Folded Spill +; CHECK-LE-WACC-NEXT: vmr v2, v31 +; CHECK-LE-WACC-NEXT: vmr v3, v30 +; CHECK-LE-WACC-NEXT: mr r29, r5 +; CHECK-LE-WACC-NEXT: bl _Z15buildVectorPairPu13__vector_pairDv16_hS0_@notoc +; CHECK-LE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-LE-WACC-NEXT: xvf32gerpp wacc0, v31, v30 +; CHECK-LE-WACC-NEXT: lxv vs0, 48(r1) +; CHECK-LE-WACC-NEXT: lxv vs1, 32(r1) +; CHECK-LE-WACC-NEXT: xvf32gerpp wacc0, vs1, vs0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v5, 0(r29) +; CHECK-LE-WACC-NEXT: pstxv v4, 8(r29), 0 +; CHECK-LE-WACC-NEXT: stxv v3, 16(r29) +; CHECK-LE-WACC-NEXT: pstxv v2, 24(r29), 0 +; CHECK-LE-WACC-NEXT: lxv v31, -48(r30) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v30, -64(r30) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: ld r29, -24(r30) # 8-byte Folded Reload +; CHECK-LE-WACC-NEXT: mr r1, r30 +; CHECK-LE-WACC-NEXT: ld r0, 16(r1) +; CHECK-LE-WACC-NEXT: ld r30, -16(r1) +; CHECK-LE-WACC-NEXT: mtlr r0 +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testMultiply: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r30, -16(r1) +; CHECK-BE-WACC-NEXT: std 
r0, 16(r1) +; CHECK-BE-WACC-NEXT: clrldi r0, r1, 59 +; CHECK-BE-WACC-NEXT: subfic r0, r0, -224 +; CHECK-BE-WACC-NEXT: mr r30, r1 +; CHECK-BE-WACC-NEXT: stdux r1, r1, r0 +; CHECK-BE-WACC-NEXT: stxv v30, -64(r30) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v31, -48(r30) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: lxv v31, 0(r3) +; CHECK-BE-WACC-NEXT: lxv v30, 0(r4) +; CHECK-BE-WACC-NEXT: addi r3, r1, 128 +; CHECK-BE-WACC-NEXT: std r29, -24(r30) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: vmr v2, v31 +; CHECK-BE-WACC-NEXT: vmr v3, v30 +; CHECK-BE-WACC-NEXT: mr r29, r5 +; CHECK-BE-WACC-NEXT: bl _Z15buildVectorPairPu13__vector_pairDv16_hS0_ +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v31, v30 +; CHECK-BE-WACC-NEXT: lxv vs0, 128(r1) +; CHECK-BE-WACC-NEXT: lxv vs1, 144(r1) +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, vs0, vs1 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: vmr v1, v2 +; CHECK-BE-WACC-NEXT: vmr v7, v4 +; CHECK-BE-WACC-NEXT: vmr v0, v3 +; CHECK-BE-WACC-NEXT: vmr v6, v5 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r29) +; CHECK-BE-WACC-NEXT: pstxv v3, 8(r29), 0 +; CHECK-BE-WACC-NEXT: stxv v4, 16(r29) +; CHECK-BE-WACC-NEXT: pstxv v5, 24(r29), 0 +; CHECK-BE-WACC-NEXT: lxv v31, -48(r30) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v30, -64(r30) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: ld r29, -24(r30) # 8-byte Folded Reload +; CHECK-BE-WACC-NEXT: mr r1, r30 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: ld r30, -16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr entry: %vP = alloca <256 x i1>, align 32 call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %vP) diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll index 059d60a9608f8..bc5d5bed36e9b 100644 --- a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll +++ b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll @@ -3,10 +3,18 @@ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ ; RUN: -disable-auto-paired-vec-st=false < %s | FileCheck %s \ ; RUN: --check-prefix=LE-PAIRED +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -disable-auto-paired-vec-st=false < %s | FileCheck %s \ +; RUN: --check-prefix=LE-PAIRED-WACC ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr -disable-auto-paired-vec-st=false < %s | \ ; RUN: FileCheck %s --check-prefix=BE-PAIRED +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr -disable-auto-paired-vec-st=false < %s | \ +; RUN: FileCheck %s --check-prefix=BE-PAIRED-WACC ; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-vsr-nums-as-vr \ ; RUN: -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s \ ; RUN: | FileCheck %s --check-prefix=LE-PWR9 @@ -36,6 +44,20 @@ define dso_local void @testLdSt(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: pstxv vs3, f@PCREL+128(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testLdSt: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv v3, f@PCREL+64(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v5, f@PCREL+96(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v2, f@PCREL+80(0), 1 +; 
LE-PAIRED-WACC-NEXT: plxv v4, f@PCREL+112(0), 1 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: pstxv v4, f@PCREL+176(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v5, f@PCREL+160(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v2, f@PCREL+144(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v3, f@PCREL+128(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testLdSt: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, f@toc@ha @@ -50,6 +72,22 @@ define dso_local void @testLdSt(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs2, 160(r3) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, f@toc@l +; BE-PAIRED-WACC-NEXT: lxv v3, 112(r3) +; BE-PAIRED-WACC-NEXT: lxv v5, 80(r3) +; BE-PAIRED-WACC-NEXT: lxv v2, 96(r3) +; BE-PAIRED-WACC-NEXT: lxv v4, 64(r3) +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: stxv v5, 176(r3) +; BE-PAIRED-WACC-NEXT: stxv v4, 160(r3) +; BE-PAIRED-WACC-NEXT: stxv v3, 144(r3) +; BE-PAIRED-WACC-NEXT: stxv v2, 128(r3) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testLdSt: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, f@toc@ha @@ -147,6 +185,25 @@ define dso_local void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: stxv vs2, 16(r4) ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testXLdSt: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: paddi r5, 0, f@PCREL, 1 +; LE-PAIRED-WACC-NEXT: sldi r3, r3, 6 +; LE-PAIRED-WACC-NEXT: add r6, r5, r3 +; LE-PAIRED-WACC-NEXT: lxvx v3, r5, r3 +; LE-PAIRED-WACC-NEXT: lxv v2, 16(r6) +; LE-PAIRED-WACC-NEXT: lxv v5, 32(r6) +; LE-PAIRED-WACC-NEXT: lxv v4, 48(r6) +; LE-PAIRED-WACC-NEXT: sldi r3, r4, 6 +; LE-PAIRED-WACC-NEXT: add r4, r5, r3 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: stxvx v3, r5, r3 +; LE-PAIRED-WACC-NEXT: stxv v4, 48(r4) +; LE-PAIRED-WACC-NEXT: stxv v5, 32(r4) +; LE-PAIRED-WACC-NEXT: stxv v2, 16(r4) +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testXLdSt: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r5, r2, f@toc@ha @@ -165,6 +222,26 @@ define dso_local void @testXLdSt(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs2, 32(r4) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testXLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r5, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r5, r5, f@toc@l +; BE-PAIRED-WACC-NEXT: sldi r3, r3, 6 +; BE-PAIRED-WACC-NEXT: add r6, r5, r3 +; BE-PAIRED-WACC-NEXT: lxvx v2, r5, r3 +; BE-PAIRED-WACC-NEXT: lxv v5, 48(r6) +; BE-PAIRED-WACC-NEXT: lxv v3, 16(r6) +; BE-PAIRED-WACC-NEXT: lxv v4, 32(r6) +; BE-PAIRED-WACC-NEXT: sldi r3, r4, 6 +; BE-PAIRED-WACC-NEXT: add r4, r5, r3 +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: stxvx v2, r5, r3 +; BE-PAIRED-WACC-NEXT: stxv v5, 48(r4) +; BE-PAIRED-WACC-NEXT: stxv v4, 32(r4) +; BE-PAIRED-WACC-NEXT: stxv v3, 16(r4) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testXLdSt: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r5, r2, f@toc@ha @@ -263,6 +340,20 @@ define dso_local void @testUnalignedLdSt() { ; LE-PAIRED-NEXT: pstxv vs3, f@PCREL+19(0), 1 ; LE-PAIRED-NEXT: blr ; +; 
LE-PAIRED-WACC-LABEL: testUnalignedLdSt: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv v3, f@PCREL+11(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v5, f@PCREL+43(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v2, f@PCREL+27(0), 1 +; LE-PAIRED-WACC-NEXT: plxv v4, f@PCREL+59(0), 1 +; LE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; LE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; LE-PAIRED-WACC-NEXT: pstxv v4, f@PCREL+67(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v5, f@PCREL+51(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v2, f@PCREL+35(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv v3, f@PCREL+19(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testUnalignedLdSt: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, f@toc@ha @@ -277,6 +368,22 @@ define dso_local void @testUnalignedLdSt() { ; BE-PAIRED-NEXT: pstxv vs2, 51(r3), 0 ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testUnalignedLdSt: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, f@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, f@toc@l +; BE-PAIRED-WACC-NEXT: plxv v3, 59(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v5, 27(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v2, 43(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv v4, 11(r3), 0 +; BE-PAIRED-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0 +; BE-PAIRED-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; BE-PAIRED-WACC-NEXT: pstxv v5, 67(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v4, 51(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v3, 35(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv v2, 19(r3), 0 +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testUnalignedLdSt: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, f@toc@ha @@ -381,6 +488,14 @@ define dso_local void @testLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: pstxv vs1, g@PCREL+64(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv vs0, g@PCREL+48(0), 1 +; LE-PAIRED-WACC-NEXT: plxv vs1, g@PCREL+32(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs0, g@PCREL+80(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs1, g@PCREL+64(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, g@toc@ha @@ -391,6 +506,16 @@ define dso_local void @testLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs0, 64(r3) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, g@toc@l +; BE-PAIRED-WACC-NEXT: lxv vs0, 48(r3) +; BE-PAIRED-WACC-NEXT: lxv vs1, 32(r3) +; BE-PAIRED-WACC-NEXT: stxv vs0, 80(r3) +; BE-PAIRED-WACC-NEXT: stxv vs1, 64(r3) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, g@toc@ha @@ -460,6 +585,19 @@ define dso_local void @testXLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; LE-PAIRED-NEXT: stxv vs1, 16(r4) ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testXLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: sldi r3, r3, 5 +; LE-PAIRED-WACC-NEXT: paddi r5, 0, g@PCREL, 1 +; LE-PAIRED-WACC-NEXT: add r6, r5, r3 +; LE-PAIRED-WACC-NEXT: lxvx vs0, r5, r3 +; LE-PAIRED-WACC-NEXT: lxv vs1, 16(r6) +; LE-PAIRED-WACC-NEXT: sldi r3, r4, 5 +; LE-PAIRED-WACC-NEXT: add r4, r5, r3 +; LE-PAIRED-WACC-NEXT: stxvx vs0, r5, r3 +; LE-PAIRED-WACC-NEXT: stxv vs1, 16(r4) +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testXLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r5, r2, 
g@toc@ha @@ -474,6 +612,20 @@ define dso_local void @testXLdStPair(i64 %SrcIdx, i64 %DstIdx) { ; BE-PAIRED-NEXT: stxv vs1, 16(r4) ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testXLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r5, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: sldi r3, r3, 5 +; BE-PAIRED-WACC-NEXT: addi r5, r5, g@toc@l +; BE-PAIRED-WACC-NEXT: add r6, r5, r3 +; BE-PAIRED-WACC-NEXT: lxvx vs0, r5, r3 +; BE-PAIRED-WACC-NEXT: lxv vs1, 16(r6) +; BE-PAIRED-WACC-NEXT: sldi r3, r4, 5 +; BE-PAIRED-WACC-NEXT: add r4, r5, r3 +; BE-PAIRED-WACC-NEXT: stxvx vs0, r5, r3 +; BE-PAIRED-WACC-NEXT: stxv vs1, 16(r4) +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testXLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r5, r2, g@toc@ha @@ -548,6 +700,14 @@ define dso_local void @testUnalignedLdStPair() { ; LE-PAIRED-NEXT: pstxv vs1, g@PCREL+19(0), 1 ; LE-PAIRED-NEXT: blr ; +; LE-PAIRED-WACC-LABEL: testUnalignedLdStPair: +; LE-PAIRED-WACC: # %bb.0: # %entry +; LE-PAIRED-WACC-NEXT: plxv vs0, g@PCREL+27(0), 1 +; LE-PAIRED-WACC-NEXT: plxv vs1, g@PCREL+11(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs0, g@PCREL+35(0), 1 +; LE-PAIRED-WACC-NEXT: pstxv vs1, g@PCREL+19(0), 1 +; LE-PAIRED-WACC-NEXT: blr +; ; BE-PAIRED-LABEL: testUnalignedLdStPair: ; BE-PAIRED: # %bb.0: # %entry ; BE-PAIRED-NEXT: addis r3, r2, g@toc@ha @@ -558,6 +718,16 @@ define dso_local void @testUnalignedLdStPair() { ; BE-PAIRED-NEXT: pstxv vs0, 19(r3), 0 ; BE-PAIRED-NEXT: blr ; +; BE-PAIRED-WACC-LABEL: testUnalignedLdStPair: +; BE-PAIRED-WACC: # %bb.0: # %entry +; BE-PAIRED-WACC-NEXT: addis r3, r2, g@toc@ha +; BE-PAIRED-WACC-NEXT: addi r3, r3, g@toc@l +; BE-PAIRED-WACC-NEXT: plxv vs0, 27(r3), 0 +; BE-PAIRED-WACC-NEXT: plxv vs1, 11(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv vs0, 35(r3), 0 +; BE-PAIRED-WACC-NEXT: pstxv vs1, 19(r3), 0 +; BE-PAIRED-WACC-NEXT: blr +; ; LE-PWR9-LABEL: testUnalignedLdStPair: ; LE-PWR9: # %bb.0: # %entry ; LE-PWR9-NEXT: addis r3, r2, g@toc@ha diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll index abc65bed5bf6c..9db8ba1c9eb09 100644 --- a/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll +++ b/llvm/test/CodeGen/PowerPC/mma-acc-spill.ll @@ -13,6 +13,13 @@ ; RUN: -mcpu=pwr11 -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names -disable-auto-paired-vec-st=false \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC + declare <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1>, <16 x i8>, <16 x i8>) declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) declare void @foo() @@ -119,6 +126,101 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-BE-NEXT: ld r0, 16(r1) ; CHECK-BE-NEXT: mtlr r0 ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: intrinsics1: +; CHECK-LE-WACC: # %bb.0: +; CHECK-LE-WACC-NEXT: mflr r0 +; CHECK-LE-WACC-NEXT: std r0, 16(r1) +; CHECK-LE-WACC-NEXT: stdu r1, -176(r1) +; CHECK-LE-WACC-NEXT: .cfi_def_cfa_offset 176 +; CHECK-LE-WACC-NEXT: .cfi_offset lr, 16 +; CHECK-LE-WACC-NEXT: .cfi_offset r30, -16 +; CHECK-LE-WACC-NEXT: 
.cfi_offset v28, -80 +; CHECK-LE-WACC-NEXT: .cfi_offset v29, -64 +; CHECK-LE-WACC-NEXT: .cfi_offset v30, -48 +; CHECK-LE-WACC-NEXT: .cfi_offset v31, -32 +; CHECK-LE-WACC-NEXT: stxv v28, 96(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v29, 112(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v30, 128(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: stxv v31, 144(r1) # 16-byte Folded Spill +; CHECK-LE-WACC-NEXT: vmr v31, v5 +; CHECK-LE-WACC-NEXT: vmr v29, v3 +; CHECK-LE-WACC-NEXT: vmr v30, v4 +; CHECK-LE-WACC-NEXT: vmr v28, v2 +; CHECK-LE-WACC-NEXT: std r30, 160(r1) # 8-byte Folded Spill +; CHECK-LE-WACC-NEXT: ld r30, 272(r1) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp60, vsp62, 0 +; CHECK-LE-WACC-NEXT: xvf16ger2pp wacc0, v2, v4 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxvp vsp36, 64(r1) +; CHECK-LE-WACC-NEXT: stxvp vsp34, 32(r1) +; CHECK-LE-WACC-NEXT: bl foo@notoc +; CHECK-LE-WACC-NEXT: lxvp vsp34, 64(r1) +; CHECK-LE-WACC-NEXT: lxvp vsp36, 32(r1) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-LE-WACC-NEXT: xvf16ger2pp wacc0, v28, v30 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r30) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r30) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r30) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r30) +; CHECK-LE-WACC-NEXT: lxv v31, 144(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v30, 128(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v29, 112(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: lxv v28, 96(r1) # 16-byte Folded Reload +; CHECK-LE-WACC-NEXT: ld r30, 160(r1) # 8-byte Folded Reload +; CHECK-LE-WACC-NEXT: addi r1, r1, 176 +; CHECK-LE-WACC-NEXT: ld r0, 16(r1) +; CHECK-LE-WACC-NEXT: mtlr r0 +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics1: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r0, 16(r1) +; CHECK-BE-WACC-NEXT: stdu r1, -256(r1) +; CHECK-BE-WACC-NEXT: .cfi_def_cfa_offset 256 +; CHECK-BE-WACC-NEXT: .cfi_offset lr, 16 +; CHECK-BE-WACC-NEXT: .cfi_offset r30, -16 +; CHECK-BE-WACC-NEXT: .cfi_offset v28, -80 +; CHECK-BE-WACC-NEXT: .cfi_offset v29, -64 +; CHECK-BE-WACC-NEXT: .cfi_offset v30, -48 +; CHECK-BE-WACC-NEXT: .cfi_offset v31, -32 +; CHECK-BE-WACC-NEXT: stxv v28, 176(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v29, 192(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v30, 208(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: stxv v31, 224(r1) # 16-byte Folded Spill +; CHECK-BE-WACC-NEXT: vmr v31, v5 +; CHECK-BE-WACC-NEXT: vmr v29, v3 +; CHECK-BE-WACC-NEXT: vmr v30, v4 +; CHECK-BE-WACC-NEXT: vmr v28, v2 +; CHECK-BE-WACC-NEXT: std r30, 240(r1) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: ld r30, 368(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp60, vsp62, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxvp vsp36, 112(r1) +; CHECK-BE-WACC-NEXT: stxvp vsp34, 144(r1) +; CHECK-BE-WACC-NEXT: bl foo +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: lxvp vsp34, 112(r1) +; CHECK-BE-WACC-NEXT: lxvp vsp36, 144(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v28, v30 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r30) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r30) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r30) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r30) +; CHECK-BE-WACC-NEXT: lxv 
v31, 224(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v30, 208(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v29, 192(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: lxv v28, 176(r1) # 16-byte Folded Reload +; CHECK-BE-WACC-NEXT: ld r30, 240(r1) # 8-byte Folded Reload +; CHECK-BE-WACC-NEXT: addi r1, r1, 256 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr %1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i8> %vc4) %2 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc3) tail call void @foo() diff --git a/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll index e932aec2c7134..7b36fa4f64f71 100644 --- a/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll +++ b/llvm/test/CodeGen/PowerPC/mma-integer-based-outer-product.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-LE-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC ; Function Attrs: nofree nounwind writeonly define dso_local void @test1(ptr nocapture readnone %vqp, ptr nocapture readnone %vpp, <16 x i8> %vc, ptr nocapture %resp) { @@ -27,6 +33,26 @@ define dso_local void @test1(ptr nocapture readnone %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test1: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: xvi16ger2 wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi16ger2 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -57,6 +83,26 @@ define dso_local void @test2(ptr nocapture readnone %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test2: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: pmxvi16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; 
CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -97,6 +143,36 @@ define dso_local void @test3(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test3: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: xvi8ger4spp wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4spp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -138,6 +214,36 @@ define dso_local void @test4(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test4: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: xvi16ger2pp wacc0, v2, v2 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi16ger2pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -179,6 +285,36 @@ define dso_local void @test5(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test5: +; CHECK-LE-WACC: # %bb.0: # 
%entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: pmxvi8ger4spp wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test5: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi8ger4spp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -220,6 +356,36 @@ define dso_local void @test6(ptr nocapture readonly %vqp, ptr nocapture readnone ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-LE-WACC-LABEL: test6: +; CHECK-LE-WACC: # %bb.0: # %entry +; CHECK-LE-WACC-NEXT: lxv v5, 0(r3) +; CHECK-LE-WACC-NEXT: lxv v1, 32(r3) +; CHECK-LE-WACC-NEXT: lxv v4, 16(r3) +; CHECK-LE-WACC-NEXT: lxv v0, 48(r3) +; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-LE-WACC-NEXT: pmxvi16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-LE-WACC-NEXT: stxv v4, 48(r7) +; CHECK-LE-WACC-NEXT: stxv v5, 32(r7) +; CHECK-LE-WACC-NEXT: stxv v2, 16(r7) +; CHECK-LE-WACC-NEXT: stxv v3, 0(r7) +; CHECK-LE-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test6: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) diff --git a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll index 8fbc9d785796d..3505cbb197bf9 100644 --- a/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll +++ b/llvm/test/CodeGen/PowerPC/mma-intrinsics.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future 
-ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC ; assemble_acc declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) @@ -32,6 +38,28 @@ define void @ass_acc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: ass_acc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: ass_acc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %ptr, align 64 @@ -66,6 +94,28 @@ define void @int_xxmtacc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxmtacc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxmtacc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: ; One xxmtacc is generated from the call to assemble.acc then one xxmtacc is ; generated from the call to xxmtacc then one xxmfacc is generated for the store @@ -101,6 +151,28 @@ define void @int_xxmfacc(ptr %ptr, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxmfacc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxmfacc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp34, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: ; One xxmtacc is generated from the call to assemble.acc then one xxmfacc is ; generated from the call to xxmfacc then one xxmfacc is generated for 
the store @@ -132,6 +204,26 @@ define void @int_xxsetaccz(ptr %ptr) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: int_xxsetaccz: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: int_xxsetaccz: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() store <512 x i1> %0, ptr %ptr, align 64 @@ -160,6 +252,26 @@ define void @disass_acc(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4) { ; CHECK-BE-NEXT: stxv vs2, 0(r5) ; CHECK-BE-NEXT: stxv vs3, 0(r6) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: disass_acc: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 0(r4) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: stxv v2, 0(r6) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: disass_acc: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: stxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %0) @@ -219,6 +331,50 @@ define void @testBranch(ptr %ptr, <16 x i8> %vc, i32 %val) { ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testBranch: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmplwi r7, 0 +; CHECK-WACC-NEXT: beq cr0, .LBB5_2 +; CHECK-WACC-NEXT: # %bb.1: # %if.then +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: b .LBB5_3 +; CHECK-WACC-NEXT: .LBB5_2: # %if.else +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-WACC-NEXT: .LBB5_3: # %if.end +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testBranch: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmplwi r7, 0 +; CHECK-BE-WACC-NEXT: beq cr0, .LBB5_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %if.then +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: b .LBB5_3 +; CHECK-BE-WACC-NEXT: .LBB5_2: # %if.else +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: .LBB5_3: # 
%if.end +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr entry: %tobool = icmp eq i32 %val, 0 br i1 %tobool, label %if.else, label %if.then @@ -273,6 +429,36 @@ define void @testcse(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -320,6 +506,42 @@ define void @testcse2(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -367,6 +589,42 @@ define void @testcse3(ptr %res, <16 x i8> %vc) { ; CHECK-BE-NEXT: stxv vs3, 112(r3) ; CHECK-BE-NEXT: stxv vs2, 96(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: 
dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r3) +; CHECK-WACC-NEXT: stxv v5, 96(r3) +; CHECK-WACC-NEXT: stxv v2, 80(r3) +; CHECK-WACC-NEXT: stxv v3, 64(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc1, v2, v2 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = call <512 x i1> @llvm.ppc.mma.xxsetaccz() %1 = call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -475,6 +733,104 @@ define void @testcse4(ptr %res, i32 %lim, ptr %vc) { ; CHECK-BE-NEXT: bdnz .LBB9_2 ; CHECK-BE-NEXT: # %bb.3: # %for.cond.cleanup ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testcse4: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmpwi r4, 1 +; CHECK-WACC-NEXT: bltlr cr0 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r4, r4, 32 +; CHECK-WACC-NEXT: mtctr r4 +; CHECK-WACC-NEXT: li r4, 0 +; CHECK-WACC-NEXT: li r6, 0 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB9_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: rldic r7, r6, 4, 28 +; CHECK-WACC-NEXT: add r8, r5, r7 +; CHECK-WACC-NEXT: lxvx vs0, r5, r7 +; CHECK-WACC-NEXT: lxv vs1, 16(r8) +; CHECK-WACC-NEXT: dmxxsetaccz wacc2 +; CHECK-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: xvf32gerpp wacc2, vs0, vs1 +; CHECK-WACC-NEXT: lxv vs0, 32(r8) +; CHECK-WACC-NEXT: lxv vs1, 48(r8) +; CHECK-WACC-NEXT: rldic r7, r4, 6, 26 +; CHECK-WACC-NEXT: addi r4, r4, 3 +; CHECK-WACC-NEXT: addi r6, r6, 6 +; CHECK-WACC-NEXT: xvf32gerpn wacc1, vs0, vs1 +; CHECK-WACC-NEXT: lxv vs0, 64(r8) +; CHECK-WACC-NEXT: lxv vs1, 80(r8) +; CHECK-WACC-NEXT: add r8, r3, r7 +; CHECK-WACC-NEXT: xvf32gernp wacc0, vs0, vs1 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc2, 0 +; CHECK-WACC-NEXT: stxvx v3, r3, r7 +; CHECK-WACC-NEXT: stxv v4, 48(r8) +; CHECK-WACC-NEXT: stxv v5, 32(r8) +; CHECK-WACC-NEXT: stxv v2, 16(r8) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r8) +; CHECK-WACC-NEXT: stxv v5, 96(r8) +; CHECK-WACC-NEXT: stxv v2, 80(r8) +; CHECK-WACC-NEXT: stxv v3, 64(r8) +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 176(r8) +; CHECK-WACC-NEXT: stxv v5, 160(r8) +; CHECK-WACC-NEXT: stxv v2, 144(r8) +; CHECK-WACC-NEXT: stxv v3, 128(r8) +; CHECK-WACC-NEXT: bdnz .LBB9_2 +; CHECK-WACC-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testcse4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: bltlr cr0 
+; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r4, r4, 32 +; CHECK-BE-WACC-NEXT: mtctr r4 +; CHECK-BE-WACC-NEXT: li r4, 0 +; CHECK-BE-WACC-NEXT: li r6, 0 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB9_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: rldic r7, r6, 4, 28 +; CHECK-BE-WACC-NEXT: add r8, r5, r7 +; CHECK-BE-WACC-NEXT: lxvx vs0, r5, r7 +; CHECK-BE-WACC-NEXT: lxv vs1, 16(r8) +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc2 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc1 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc2, vs0, vs1 +; CHECK-BE-WACC-NEXT: lxv vs0, 32(r8) +; CHECK-BE-WACC-NEXT: lxv vs1, 48(r8) +; CHECK-BE-WACC-NEXT: rldic r7, r4, 6, 26 +; CHECK-BE-WACC-NEXT: addi r4, r4, 3 +; CHECK-BE-WACC-NEXT: addi r6, r6, 6 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc1, vs0, vs1 +; CHECK-BE-WACC-NEXT: lxv vs0, 64(r8) +; CHECK-BE-WACC-NEXT: lxv vs1, 80(r8) +; CHECK-BE-WACC-NEXT: add r8, r3, r7 +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, vs0, vs1 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc2, 0 +; CHECK-BE-WACC-NEXT: stxvx v2, r3, r7 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r8) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r8) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc1, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r8) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r8) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r8) +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 176(r8) +; CHECK-BE-WACC-NEXT: stxv v4, 160(r8) +; CHECK-BE-WACC-NEXT: stxv v3, 144(r8) +; CHECK-BE-WACC-NEXT: stxv v2, 128(r8) +; CHECK-BE-WACC-NEXT: bdnz .LBB9_2 +; CHECK-BE-WACC-NEXT: # %bb.3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: blr entry: %cmp55 = icmp sgt i32 %lim, 0 br i1 %cmp55, label %for.body.preheader, label %for.cond.cleanup @@ -600,6 +956,71 @@ define void @testRedundantPrimeUnprime(ptr %dst, <16 x i8> %vc) nounwind { ; CHECK-BE-NEXT: ld r0, 16(r1) ; CHECK-BE-NEXT: mtlr r0 ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testRedundantPrimeUnprime: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: mflr r0 +; CHECK-WACC-NEXT: std r30, -16(r1) # 8-byte Folded Spill +; CHECK-WACC-NEXT: std r0, 16(r1) +; CHECK-WACC-NEXT: stdu r1, -112(r1) +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0 +; CHECK-WACC-NEXT: stxv v0, 48(r3) +; CHECK-WACC-NEXT: stxv v1, 32(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-WACC-NEXT: mr r30, r3 +; CHECK-WACC-NEXT: stxvp vsp36, 64(r1) +; CHECK-WACC-NEXT: stxvp vsp34, 32(r1) +; CHECK-WACC-NEXT: bl testRedundantPrimeUnprimeF@notoc +; CHECK-WACC-NEXT: lxvp vsp34, 64(r1) +; CHECK-WACC-NEXT: lxvp vsp36, 32(r1) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 112(r30) +; CHECK-WACC-NEXT: stxv v5, 96(r30) +; CHECK-WACC-NEXT: stxv v2, 80(r30) +; CHECK-WACC-NEXT: stxv v3, 64(r30) +; CHECK-WACC-NEXT: addi r1, r1, 112 +; CHECK-WACC-NEXT: ld r0, 16(r1) +; CHECK-WACC-NEXT: ld r30, -16(r1) # 8-byte Folded Reload +; CHECK-WACC-NEXT: mtlr r0 +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testRedundantPrimeUnprime: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: mflr r0 +; CHECK-BE-WACC-NEXT: std r0, 16(r1) +; CHECK-BE-WACC-NEXT: stdu 
r1, -192(r1) +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: std r30, 176(r1) # 8-byte Folded Spill +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v1, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v0, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r3) +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0 +; CHECK-BE-WACC-NEXT: mr r30, r3 +; CHECK-BE-WACC-NEXT: stxvp vsp36, 112(r1) +; CHECK-BE-WACC-NEXT: stxvp vsp34, 144(r1) +; CHECK-BE-WACC-NEXT: bl testRedundantPrimeUnprimeF +; CHECK-BE-WACC-NEXT: nop +; CHECK-BE-WACC-NEXT: lxvp vsp34, 112(r1) +; CHECK-BE-WACC-NEXT: lxvp vsp36, 144(r1) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 112(r30) +; CHECK-BE-WACC-NEXT: stxv v4, 96(r30) +; CHECK-BE-WACC-NEXT: stxv v3, 80(r30) +; CHECK-BE-WACC-NEXT: stxv v2, 64(r30) +; CHECK-BE-WACC-NEXT: ld r30, 176(r1) # 8-byte Folded Reload +; CHECK-BE-WACC-NEXT: addi r1, r1, 192 +; CHECK-BE-WACC-NEXT: ld r0, 16(r1) +; CHECK-BE-WACC-NEXT: mtlr r0 +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xxsetaccz() store <512 x i1> %0, ptr %dst, align 64 @@ -646,6 +1067,38 @@ define void @test_ldst_1(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: plxvp vsp36, 8(r4), 0 +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: plxvp vsp36, 8(r4), 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = getelementptr i8, ptr %vpp, i64 8 @@ -688,6 +1141,38 @@ define void @test_ldst_2(ptr nocapture readonly %vqp, ptr %vpp, <16 x i8> %vc, p ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxvp vsp36, 0(r4) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 
0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxvp vsp36, 0(r4) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp) @@ -729,6 +1214,38 @@ define void @test_ldst_3(ptr nocapture readonly %vqp, i64 %offs, ptr %vpp, <16 x ; CHECK-BE-NEXT: stxv vs3, 48(r9) ; CHECK-BE-NEXT: stxv vs2, 32(r9) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test_ldst_3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxvp vsp36, 0(r5) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r9) +; CHECK-WACC-NEXT: stxv v5, 32(r9) +; CHECK-WACC-NEXT: stxv v2, 16(r9) +; CHECK-WACC-NEXT: stxv v3, 0(r9) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test_ldst_3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxvp vsp36, 0(r5) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r9) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r9) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r9) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r9) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <256 x i1> @llvm.ppc.vsx.lxvp(ptr %vpp) diff --git a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll index ac6ad41633492..ff860b8d6ff22 100644 --- a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll +++ b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll @@ -5,6 +5,12 @@ ; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -enable-subreg-liveness -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -enable-subreg-liveness -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -enable-subreg-liveness -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC declare <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>) declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>) @@ -56,6 +62,46 @@ define void @intrinsics1(<16 x i8> %vc1, <16 x i8> %vc2, <16 x i8> %vc3, <16 x i ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: 
intrinsics1: +; CHECK-WACC: # %bb.0: +; CHECK-WACC-NEXT: vmr v1, v4 +; CHECK-WACC-NEXT: vmr v4, v3 +; CHECK-WACC-NEXT: vmr v0, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v4 +; CHECK-WACC-NEXT: ld r3, 96(r1) +; CHECK-WACC-NEXT: xvf16ger2pp wacc0, v0, v1 +; CHECK-WACC-NEXT: vmr v3, v2 +; CHECK-WACC-NEXT: vmr v2, v5 +; CHECK-WACC-NEXT: pmxvf32gerpn wacc0, v4, v5, 0, 0 +; CHECK-WACC-NEXT: pmxvf64gernp wacc0, vsp34, v0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r3) +; CHECK-WACC-NEXT: stxv v5, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics1: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: vmr v1, v4 +; CHECK-BE-WACC-NEXT: vmr v4, v3 +; CHECK-BE-WACC-NEXT: vmr v0, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: ld r3, 112(r1) +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v0, v1 +; CHECK-BE-WACC-NEXT: vmr v3, v2 +; CHECK-BE-WACC-NEXT: vmr v2, v5 +; CHECK-BE-WACC-NEXT: pmxvf32gerpn wacc0, v4, v5, 0, 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernp wacc0, vsp34, v0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: blr %1 = tail call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %vc1, <16 x i8> %vc3, <16 x i8> %vc2, <16 x i8> %vc4) %2 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %1, <16 x i8> %vc1, <16 x i8> %vc2) %3 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %2, <16 x i8> %vc1, <16 x i8> %vc3) @@ -115,6 +161,46 @@ define void @intrinsics2(ptr %ptr1, ptr %ptr2, ptr %ptr3, ptr %ptr4, ptr %ptr) { ; CHECK-BE-NEXT: stxv vs2, 0(r5) ; CHECK-BE-NEXT: stxv vs3, 0(r6) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: intrinsics2: +; CHECK-WACC: # %bb.0: +; CHECK-WACC-NEXT: lxv v2, 0(r3) +; CHECK-WACC-NEXT: lxv v4, 0(r5) +; CHECK-WACC-NEXT: lxv v3, 0(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r6) +; CHECK-WACC-NEXT: vmr v1, v2 +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-WACC-NEXT: xvi8ger4pp wacc0, v2, v3 +; CHECK-WACC-NEXT: xvf16ger2pn wacc0, v2, v4 +; CHECK-WACC-NEXT: vmr v0, v5 +; CHECK-WACC-NEXT: pmxvf32gernn wacc0, v3, v5, 0, 0 +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp32, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 0(r4) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: stxv v2, 0(r6) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: intrinsics2: +; CHECK-BE-WACC: # %bb.0: +; CHECK-BE-WACC-NEXT: lxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: lxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: vmr v1, v2 +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4pp wacc0, v2, v3 +; CHECK-BE-WACC-NEXT: xvf16ger2pn wacc0, v2, v4 +; CHECK-BE-WACC-NEXT: vmr v0, v5 +; CHECK-BE-WACC-NEXT: pmxvf32gernn wacc0, v3, v5, 0, 0 +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp32, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 0(r4) +; CHECK-BE-WACC-NEXT: stxv v4, 0(r5) +; CHECK-BE-WACC-NEXT: stxv v5, 0(r6) +; CHECK-BE-WACC-NEXT: blr %vc1 = load <16 
x i8>, ptr %ptr1, align 16 %vc2 = load <16 x i8>, ptr %ptr2, align 16 %vc3 = load <16 x i8>, ptr %ptr3, align 16 @@ -157,6 +243,26 @@ define void @test1(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi4ger8 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi4ger8 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -196,6 +302,36 @@ define void @test2(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi4ger8pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi4ger8pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -226,6 +362,26 @@ define void @test3(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test3: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi4ger8 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test3: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi4ger8 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, 
ptr %resp, align 64 @@ -265,6 +421,36 @@ define void @test4(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test4: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi4ger8pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test4: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi4ger8pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi4ger8pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -295,6 +481,26 @@ define void @test5(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test5: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi8ger4 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test5: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi8ger4 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -334,6 +540,36 @@ define void @test6(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test6: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi8ger4pp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test6: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi8ger4pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: 
dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi8ger4pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -364,6 +600,26 @@ define void @test7(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test7: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi8ger4 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test7: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi8ger4 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -403,6 +659,36 @@ define void @test8(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test8: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi8ger4pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test8: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi8ger4pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi8ger4pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -433,6 +719,26 @@ define void @test9(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test9: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvi16ger2s wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test9: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvi16ger2s wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, 
wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2s(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -472,6 +778,36 @@ define void @test10(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test10: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvi16ger2spp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test10: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvi16ger2spp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvi16ger2spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -502,6 +838,26 @@ define void @test11(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test11: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvi16ger2s wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test11: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvi16ger2s wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2s(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -541,6 +897,36 @@ define void @test12(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test12: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvi16ger2spp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: 
test12: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvi16ger2spp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvi16ger2spp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -571,6 +957,26 @@ define void @test13(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test13: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvf16ger2 wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test13: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvf16ger2 wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -610,6 +1016,36 @@ define void @test14(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test14: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2pp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test14: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -650,6 +1086,36 @@ define void @test15(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test15: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; 
CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2pn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test15: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2pn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2pn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -690,6 +1156,36 @@ define void @test16(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test16: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2np wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test16: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2np wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2np(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -730,6 +1226,36 @@ define void @test17(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test17: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf16ger2nn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test17: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf16ger2nn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: 
dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf16ger2nn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -760,6 +1286,26 @@ define void @test18(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test18: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvf16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test18: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvf16ger2 wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -799,6 +1345,36 @@ define void @test19(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test19: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test19: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2pp wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -839,6 +1415,36 @@ define void @test20(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test20: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2pn wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) 
+; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test20: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2pn wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2pn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -879,6 +1485,36 @@ define void @test21(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test21: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2np wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test21: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2np wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2np(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -919,6 +1555,36 @@ define void @test22(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test22: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf16ger2nn wacc0, v2, v2, 0, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test22: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf16ger2nn wacc0, v2, v2, 0, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; 
CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf16ger2nn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0, i32 0) @@ -949,6 +1615,26 @@ define void @test23(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test23: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: xvf32ger wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test23: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: xvf32ger wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.xvf32ger(<16 x i8> %vc, <16 x i8> %vc) store <512 x i1> %0, ptr %resp, align 64 @@ -988,6 +1674,36 @@ define void @test24(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test24: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test24: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1028,6 +1744,36 @@ define void @test25(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test25: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test25: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) 
+; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gerpn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1068,6 +1814,36 @@ define void @test26(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test26: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test26: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1108,6 +1884,36 @@ define void @test27(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test27: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: xvf32gernn wacc0, v2, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test27: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: xvf32gernn wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf32gernn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc) @@ -1138,6 +1944,26 @@ define void @test28(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; 
CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test28: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: pmxvf32ger wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test28: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: pmxvf32ger wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32ger(<16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) store <512 x i1> %0, ptr %resp, align 64 @@ -1177,6 +2003,36 @@ define void @test29(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test29: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gerpp wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test29: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gerpp wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1217,6 +2073,36 @@ define void @test30(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test30: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gerpn wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test30: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gerpn wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: 
stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gerpn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1257,6 +2143,36 @@ define void @test31(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test31: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gernp wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test31: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gernp wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernp(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1297,6 +2213,36 @@ define void @test32(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test32: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: pmxvf32gernn wacc0, v2, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test32: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: pmxvf32gernn wacc0, v2, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf32gernn(<512 x i1> %0, <16 x i8> %vc, <16 x i8> %vc, i32 0, i32 0) @@ -1331,6 +2277,30 @@ define void @test33(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test33: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64ger wacc0, vsp36, v2 +; CHECK-WACC-NEXT: 
dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test33: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64ger wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <256 x i1>, ptr %vpp, align 32 %1 = tail call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> %0, <16 x i8> %vc) @@ -1375,6 +2345,40 @@ define void @test34(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test34: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test34: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1420,6 +2424,40 @@ define void @test35(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test35: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gerpn wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test35: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gerpn wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; 
CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1465,6 +2503,40 @@ define void @test36(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test36: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test36: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1510,6 +2582,40 @@ define void @test37(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test37: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: xvf64gernn wacc0, vsp36, v2 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test37: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: xvf64gernn wacc0, vsp36, v2 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1545,6 +2651,30 @@ define void @test38(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test38: +; CHECK-WACC: # %bb.0: 
# %entry +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64ger wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test38: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64ger wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <256 x i1>, ptr %vpp, align 32 %1 = tail call <512 x i1> @llvm.ppc.mma.pmxvf64ger(<256 x i1> %0, <16 x i8> %vc, i32 0, i32 0) @@ -1589,6 +2719,40 @@ define void @test39(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test39: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gerpp wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test39: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gerpp wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1634,6 +2798,40 @@ define void @test40(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test40: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gerpn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test40: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, 
vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gerpn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1679,6 +2877,40 @@ define void @test41(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test41: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gernp wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test41: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gernp wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 32 @@ -1724,6 +2956,40 @@ define void @test42(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) { ; CHECK-BE-NEXT: stxv vs3, 48(r7) ; CHECK-BE-NEXT: stxv vs2, 32(r7) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: test42: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v5, 0(r3) +; CHECK-WACC-NEXT: lxv v1, 32(r3) +; CHECK-WACC-NEXT: lxv v4, 16(r3) +; CHECK-WACC-NEXT: lxv v0, 48(r3) +; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-WACC-NEXT: lxv v4, 16(r4) +; CHECK-WACC-NEXT: lxv v5, 0(r4) +; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r7) +; CHECK-WACC-NEXT: stxv v5, 32(r7) +; CHECK-WACC-NEXT: stxv v2, 16(r7) +; CHECK-WACC-NEXT: stxv v3, 0(r7) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: test42: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: lxv v1, 16(r3) +; CHECK-BE-WACC-NEXT: lxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: lxv v0, 0(r3) +; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0 +; CHECK-BE-WACC-NEXT: lxv v4, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v5, 16(r4) +; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp36, v2, 0, 0 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r7) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r7) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r7) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r7) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <512 x i1>, ptr %vqp, align 64 %1 = load <256 x i1>, ptr %vpp, align 
32 diff --git a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll index 89e5147aecc5f..37d0e69b3beaa 100644 --- a/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll +++ b/llvm/test/CodeGen/PowerPC/mma-phi-accs.ll @@ -5,6 +5,12 @@ ; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ ; RUN: -mcpu=pwr10 -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE +; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-WACC +; RUN: llc -O3 -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -mcpu=future -ppc-asm-full-reg-names \ +; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE-WACC declare <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8>, <16 x i8>) declare <512 x i1> @llvm.ppc.mma.xxsetaccz() @@ -64,6 +70,60 @@ define void @testPHI1(ptr %Dst, ptr %Src, i32 signext %Len) { ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testPHI1: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmpwi r5, 3 +; CHECK-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-WACC-NEXT: blt cr0, .LBB0_3 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-WACC-NEXT: addi r5, r5, -2 +; CHECK-WACC-NEXT: lxv v2, 0(r4) +; CHECK-WACC-NEXT: lxv v3, 16(r4) +; CHECK-WACC-NEXT: mtctr r5 +; CHECK-WACC-NEXT: addi r4, r4, 32 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB0_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-WACC-NEXT: addi r4, r4, 16 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: bdnz .LBB0_2 +; CHECK-WACC-NEXT: .LBB0_3: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 48(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testPHI1: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmpwi r5, 3 +; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0 +; CHECK-BE-WACC-NEXT: blt cr0, .LBB0_3 +; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-BE-WACC-NEXT: addi r5, r5, -2 +; CHECK-BE-WACC-NEXT: lxv v2, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v3, 16(r4) +; CHECK-BE-WACC-NEXT: mtctr r5 +; CHECK-BE-WACC-NEXT: addi r4, r4, 32 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB0_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-BE-WACC-NEXT: addi r4, r4, 16 +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: bdnz .LBB0_2 +; CHECK-BE-WACC-NEXT: .LBB0_3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <16 x i8>, ptr %Src, align 16 %arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1 @@ -161,6 +221,62 @@ define dso_local void @testPHI2(ptr %Dst, ptr %Src, i32 signext %Len) { ; CHECK-BE-NEXT: stxv vs2, 32(r3) ; CHECK-BE-NEXT: stxv vs3, 48(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testPHI2: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: lxv v2, 0(r4) +; CHECK-WACC-NEXT: lxv v3, 16(r4) +; 
CHECK-WACC-NEXT: lxv vs0, 32(r4) +; CHECK-WACC-NEXT: cmpwi r5, 4 +; CHECK-WACC-NEXT: xvf64ger wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: blt cr0, .LBB1_3 +; CHECK-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-WACC-NEXT: addi r5, r5, -3 +; CHECK-WACC-NEXT: mtctr r5 +; CHECK-WACC-NEXT: addi r4, r4, 48 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB1_2: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-WACC-NEXT: addi r4, r4, 16 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: bdnz .LBB1_2 +; CHECK-WACC-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v5, 0(r3) +; CHECK-WACC-NEXT: stxv v4, 16(r3) +; CHECK-WACC-NEXT: stxv v3, 32(r3) +; CHECK-WACC-NEXT: stxv v2, 48(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testPHI2: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: lxv v2, 0(r4) +; CHECK-BE-WACC-NEXT: lxv v3, 16(r4) +; CHECK-BE-WACC-NEXT: lxv vs0, 32(r4) +; CHECK-BE-WACC-NEXT: cmpwi r5, 4 +; CHECK-BE-WACC-NEXT: xvf64ger wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: blt cr0, .LBB1_3 +; CHECK-BE-WACC-NEXT: # %bb.1: # %for.body.preheader +; CHECK-BE-WACC-NEXT: clrldi r5, r5, 32 +; CHECK-BE-WACC-NEXT: addi r5, r5, -3 +; CHECK-BE-WACC-NEXT: mtctr r5 +; CHECK-BE-WACC-NEXT: addi r4, r4, 48 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB1_2: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: lxv vs0, 0(r4) +; CHECK-BE-WACC-NEXT: addi r4, r4, 16 +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: bdnz .LBB1_2 +; CHECK-BE-WACC-NEXT: .LBB1_3: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v2, 0(r3) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r3) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r3) +; CHECK-BE-WACC-NEXT: stxv v5, 48(r3) +; CHECK-BE-WACC-NEXT: blr entry: %0 = load <16 x i8>, ptr %Src, align 16 %arrayidx1 = getelementptr inbounds <16 x i8>, ptr %Src, i64 1 @@ -229,6 +345,28 @@ define void @testImplicitDef(ptr %ptr) { ; CHECK-BE-NEXT: xxmfacc acc0 ; CHECK-BE-NEXT: stxv vs3, 0(r3) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testImplicitDef: +; CHECK-WACC: # %bb.0: # %label1 +; CHECK-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-WACC-NEXT: bc 12, 4*cr5+lt, .LBB2_2 +; CHECK-WACC-NEXT: # %bb.1: # %label2 +; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-WACC-NEXT: .LBB2_2: # %label3 +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: stxv v2, 0(r3) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testImplicitDef: +; CHECK-BE-WACC: # %bb.0: # %label1 +; CHECK-BE-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-BE-WACC-NEXT: bc 12, 4*cr5+lt, .LBB2_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %label2 +; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp34, vs0 +; CHECK-BE-WACC-NEXT: .LBB2_2: # %label3 +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 0(r3) +; CHECK-BE-WACC-NEXT: blr label1: br i1 undef, label %label3, label %label2 @@ -312,6 +450,70 @@ define dso_local signext i32 @testNestedPHI(i32 signext %cond, i32 signext %coun ; CHECK-BE-NEXT: stxv vs3, 48(r5) ; CHECK-BE-NEXT: stxv vs2, 32(r5) ; CHECK-BE-NEXT: blr +; +; CHECK-WACC-LABEL: testNestedPHI: +; CHECK-WACC: # %bb.0: # %entry +; CHECK-WACC-NEXT: cmplwi r3, 0 +; CHECK-WACC-NEXT: beq cr0, .LBB3_2 +; CHECK-WACC-NEXT: # %bb.1: # %if.then +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: cmpwi r4, 1 +; 
CHECK-WACC-NEXT: bge cr0, .LBB3_3 +; CHECK-WACC-NEXT: b .LBB3_5 +; CHECK-WACC-NEXT: .LBB3_2: +; CHECK-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-WACC-NEXT: cmpwi r4, 1 +; CHECK-WACC-NEXT: blt cr0, .LBB3_5 +; CHECK-WACC-NEXT: .LBB3_3: # %for.body.preheader +; CHECK-WACC-NEXT: addi r3, r4, -1 +; CHECK-WACC-NEXT: clrldi r3, r3, 32 +; CHECK-WACC-NEXT: addi r3, r3, 1 +; CHECK-WACC-NEXT: mtctr r3 +; CHECK-WACC-NEXT: .p2align 4 +; CHECK-WACC-NEXT: .LBB3_4: # %for.body +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-WACC-NEXT: bdnz .LBB3_4 +; CHECK-WACC-NEXT: .LBB3_5: # %for.cond.cleanup +; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-WACC-NEXT: li r3, 0 +; CHECK-WACC-NEXT: stxv v4, 48(r5) +; CHECK-WACC-NEXT: stxv v5, 32(r5) +; CHECK-WACC-NEXT: stxv v2, 16(r5) +; CHECK-WACC-NEXT: stxv v3, 0(r5) +; CHECK-WACC-NEXT: blr +; +; CHECK-BE-WACC-LABEL: testNestedPHI: +; CHECK-BE-WACC: # %bb.0: # %entry +; CHECK-BE-WACC-NEXT: cmplwi r3, 0 +; CHECK-BE-WACC-NEXT: beq cr0, .LBB3_2 +; CHECK-BE-WACC-NEXT: # %bb.1: # %if.then +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: bge cr0, .LBB3_3 +; CHECK-BE-WACC-NEXT: b .LBB3_5 +; CHECK-BE-WACC-NEXT: .LBB3_2: +; CHECK-BE-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-BE-WACC-NEXT: cmpwi r4, 1 +; CHECK-BE-WACC-NEXT: blt cr0, .LBB3_5 +; CHECK-BE-WACC-NEXT: .LBB3_3: # %for.body.preheader +; CHECK-BE-WACC-NEXT: addi r3, r4, -1 +; CHECK-BE-WACC-NEXT: clrldi r3, r3, 32 +; CHECK-BE-WACC-NEXT: addi r3, r3, 1 +; CHECK-BE-WACC-NEXT: mtctr r3 +; CHECK-BE-WACC-NEXT: .p2align 4 +; CHECK-BE-WACC-NEXT: .LBB3_4: # %for.body +; CHECK-BE-WACC-NEXT: # +; CHECK-BE-WACC-NEXT: xvf32gernp wacc0, v2, v2 +; CHECK-BE-WACC-NEXT: bdnz .LBB3_4 +; CHECK-BE-WACC-NEXT: .LBB3_5: # %for.cond.cleanup +; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0 +; CHECK-BE-WACC-NEXT: li r3, 0 +; CHECK-BE-WACC-NEXT: stxv v5, 48(r5) +; CHECK-BE-WACC-NEXT: stxv v4, 32(r5) +; CHECK-BE-WACC-NEXT: stxv v3, 16(r5) +; CHECK-BE-WACC-NEXT: stxv v2, 0(r5) +; CHECK-BE-WACC-NEXT: blr entry: %tobool.not = icmp eq i32 %cond, 0 br i1 %tobool.not, label %if.end, label %if.then diff --git a/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll b/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll index 291cf97fd009e..929bf5f61dd90 100644 --- a/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll +++ b/llvm/test/CodeGen/PowerPC/peephole-mma-phi-liveness.ll @@ -1,5 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -verify-machineinstrs -mcpu=ppc -mtriple=powerpc64-ibm-aix < %s | FileCheck %s +; RUN: llc -verify-machineinstrs -mcpu=future \ +; RUN: -mtriple=powerpc64-ibm-aix < %s | FileCheck %s --check-prefix=CHECK-WACC target datalayout = "E-m:a-Fi64-i64:64-n32:64-S128-v256:256:256-v512:512:512" @@ -38,6 +40,43 @@ define void @baz(i64 %arg) local_unnamed_addr #0 { ; CHECK-NEXT: xxswapd 0, 0 ; CHECK-NEXT: stxv 0, 0(3) ; CHECK-NEXT: blr +; +; CHECK-WACC-LABEL: baz: +; CHECK-WACC: # %bb.0: # %bb +; CHECK-WACC-NEXT: dmxxextfdmr512 34, 36, 0, 0 +; CHECK-WACC-NEXT: xxmrgld 1, 34, 36 +; CHECK-WACC-NEXT: xxswapd 2, 1 +; CHECK-WACC-NEXT: xxlxor 0, 0, 0 +; CHECK-WACC-NEXT: xvnegdp 1, 1 +; CHECK-WACC-NEXT: xvnegdp 2, 2 +; CHECK-WACC-NEXT: xvsubdp 1, 1, 0 +; CHECK-WACC-NEXT: xvsubdp 2, 2, 37 +; CHECK-WACC-NEXT: xvmuldp 1, 1, 0 +; CHECK-WACC-NEXT: xvmuldp 2, 2, 0 +; CHECK-WACC-NEXT: xvmaddadp 1, 0, 0 +; CHECK-WACC-NEXT: xvmaddadp 2, 0, 0 +; CHECK-WACC-NEXT: stxv 
1, 0(3) +; CHECK-WACC-NEXT: stxv 2, 0(3) +; CHECK-WACC-NEXT: # implicit-def: $wacc0 +; CHECK-WACC-NEXT: bc 12, 20, L..BB0_2 +; CHECK-WACC-NEXT: # %bb.1: # %bb10 +; CHECK-WACC-NEXT: xvf64gerpp 0, 34, 0 +; CHECK-WACC-NEXT: L..BB0_2: # %bb12 +; CHECK-WACC-NEXT: cmpdi 3, 0 +; CHECK-WACC-NEXT: .align 4 +; CHECK-WACC-NEXT: L..BB0_3: # %bb13 +; CHECK-WACC-NEXT: # +; CHECK-WACC-NEXT: bc 4, 2, L..BB0_3 +; CHECK-WACC-NEXT: # %bb.4: # %bb14 +; CHECK-WACC-NEXT: dmxxextfdmr512 34, 36, 0, 0 +; CHECK-WACC-NEXT: xxlxor 0, 0, 0 +; CHECK-WACC-NEXT: xvsubdp 1, 0, 35 +; CHECK-WACC-NEXT: xxlxor 2, 2, 2 +; CHECK-WACC-NEXT: xvmaddadp 2, 1, 2 +; CHECK-WACC-NEXT: xvadddp 0, 2, 0 +; CHECK-WACC-NEXT: xxswapd 0, 0 +; CHECK-WACC-NEXT: stxv 0, 0(3) +; CHECK-WACC-NEXT: blr bb: %call = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> poison) %extractvalue = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %call, 0 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll new file mode 100644 index 0000000000000..5cb55f15c7c8c --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll @@ -0,0 +1,1341 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> 
@llvm.riscv.vloxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i64( + <vscale x 
1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + 
i64); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define 
<vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i32> 
@intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) 
nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i64( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i64( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i64( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i64( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i64( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i64( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> 
%2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i64( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i64( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i64(ptr 
%0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i64( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i64( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i64( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 
x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i64( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll new file mode 100644 index 0000000000000..fafd45b7579e8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll @@ -0,0 +1,5100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i32( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i32( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vloxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vloxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: 
vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: 
# %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i32( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i32( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> 
@intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i32( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i32( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + 
<vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i32( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i32( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i32( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale 
x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i32( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i32( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> 
@llvm.riscv.vloxei.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i32( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i32( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i32( + <vscale x 2 x double> poison, + ptr %0, + <vscale 
x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i32( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i32( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i16( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i16( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v 
v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i16( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vloxei16.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i16( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vloxei16.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i16( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; 
CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vloxei16.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i16( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x 
i16> @llvm.riscv.vloxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> 
@llvm.riscv.vloxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale 
x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: 
vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> 
@intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i16( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + 
<vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i16( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i16( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i16( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> 
@intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i16( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i16( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i16( + <vscale x 1 x 
float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i16( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i16( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i16( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare 
<vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i16( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i16( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t 
+; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i16( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i16( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: 
# %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i16( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i16( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> 
%0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, 
e8, m1, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret 
+entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vloxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vloxei_v_nxv64i8_nxv64i8_nxv64i8(ptr %0, <vscale x 64 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vloxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> poison, + ptr %0, + <vscale x 64 x i8> %1, + iXLen %2) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vloxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vloxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vloxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> 
@llvm.riscv.vloxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + 
%a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: 
vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i64> 
@intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + 
iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i8( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i8( 
+ <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i8( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i8( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i8( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i8( 
+ <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i8( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i8( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> 
@llvm.riscv.vloxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i8( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i8( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), 
v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i8( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i8( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i8( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vloxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i8( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i8( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x double> 
@intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i8( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vloxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i8( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll new file mode 100644 index 0000000000000..916af2556c6a8 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll @@ -0,0 +1,1341 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. 
+ +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + 
i64, + i64); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i16> 
@intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i16> 
@intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i32> 
@intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr 
%1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i64: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i64( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i64( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i64( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i64( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei64.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i64( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei64.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i64( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei64.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i64( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei64.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i64( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i64( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i64> %1, + i64 %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64, + i64); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i64( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i64> %1, + i64 %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64, + i64); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i64( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i64> %1, + i64 %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64, + i64); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei64.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i64( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i64> %1, + i64 %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64, + i64); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4, i64 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll new file mode 100644 index 0000000000000..8dd32a1d640dc --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll @@ -0,0 +1,5100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i32( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i32( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry 
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei32.v v16, (a0), 
v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, 
a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma 
+; CHECK-NEXT: vluxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, 
e32, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + 
%a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i32( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i32( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei32.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i32( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale 
x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei32.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i32( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei32.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i32( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 
1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i32( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i32( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i32( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i32> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> 
@intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i32( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i32( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i32> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i32( + <vscale x 1 x double>, + 
ptr, + <vscale x 1 x i32>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei32.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i32( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i32> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i32( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i32> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i32( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i32> 
%1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i32( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i32> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i16( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call 
<vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i16( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i16( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei16.v v10, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i16( + <vscale x 8 x i8> 
poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei16.v v12, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i16( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vluxei16.v v16, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i16( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> 
@llvm.riscv.vluxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i16> 
%1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> 
%2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i16> 
%1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = 
call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v 
v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i16( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i16( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i16( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i16( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i16( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i16( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i16> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i16( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i16(ptr %0, <vscale 
x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i16( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i16( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i16( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + 
<vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i16( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i16> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei16.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i16( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i16> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> 
%0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i16( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i16> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i16( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i16> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; 
CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i16( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i16> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT: 
vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen 
%2) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vluxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vluxei_v_nxv64i8_nxv64i8_nxv64i8(ptr %0, <vscale x 64 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v8 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vluxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> poison, + ptr %0, + <vscale x 64 x i8> %1, + iXLen %2) + + ret <vscale x 64 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vluxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen, + iXLen); + +define <vscale x 64 x i8> @intrinsic_vluxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 64 x i8> @llvm.riscv.vluxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4, iXLen 1) + + 
ret <vscale x 64 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> 
@llvm.riscv.vluxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x 
i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> 
@llvm.riscv.vluxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei8.v 
v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; 
CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) 
nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i8( + <vscale x 1 x half> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i8( + <vscale x 2 x half> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i8( + <vscale x 4 x half> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i8( + <vscale x 8 x half> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i8( + <vscale x 16 x half> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x half> 
@intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv4r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i8( + <vscale x 32 x half> poison, + ptr %0, + <vscale x 32 x i8> %1, + iXLen %2) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen, + iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i8( + <vscale x 1 x float> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x float> %a +} + +declare <vscale x 2 
x float> @llvm.riscv.vluxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i8( + <vscale x 2 x float> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i8( + <vscale x 4 x float> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i8( + <vscale x 8 x float> poison, + ptr %0, + <vscale x 8 x 
i8> %1, + iXLen %2) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv2r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i8( + <vscale x 16 x float> poison, + ptr %0, + <vscale x 16 x i8> %1, + iXLen %2) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen, + iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vluxei8.v v9, (a0), v8 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i8( + <vscale x 1 x double> poison, + ptr %0, + <vscale x 1 x i8> %1, + iXLen %2) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen, + iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v9, v0.t +; 
CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i8( + <vscale x 2 x double> poison, + ptr %0, + <vscale x 2 x i8> %1, + iXLen %2) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen, + iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v12, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i8( + <vscale x 4 x double> poison, + ptr %0, + <vscale x 4 x i8> %1, + iXLen %2) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen, + iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i8: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT: vmv1r.v v16, v8 +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vluxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i8( + <vscale x 8 x double> poison, + ptr %0, + <vscale x 8 x i8> %1, + iXLen %2) + + ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen, + iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT: vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4, iXLen 1) + + ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll new file mode 100644 index 0000000000000..4963d91a14988 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll @@ -0,0 +1,1293 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN: < %s | FileCheck %s + +; The intrinsics are not supported with RV32. + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; 
CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 
8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} 
+ +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v 
v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, 
<vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, 
a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + 
+define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsoxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll new file mode 100644 index 0000000000000..7ea2e1734e5a2 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll @@ -0,0 +1,4881 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i32( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, 
iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i32( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, 
mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void 
@intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsoxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 
1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 
x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i32: 
+; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret 
void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + 
+define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, 
+ ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + 
+declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i16: 
+; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 
8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; 
CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i16(<vscale 
x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr 
%1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + 
ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale 
x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, 
ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsoxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: 
vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void 
@intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + 
<vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; 
CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void 
@intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + 
ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + 
+declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsoxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsoxei.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void 
@intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; CHECK-NEXT: vsoxei8.v v8, (a0), v12, v0.t
+; CHECK-NEXT: ret
+entry:
+ call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i8(
+ <vscale x 4 x double> %0,
+ ptr %1,
+ <vscale x 4 x i8> %2,
+ <vscale x 4 x i1> %3,
+ iXLen %4)
+
+ ret void
+}
+
+declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i8(
+ <vscale x 8 x double>,
+ ptr,
+ <vscale x 8 x i8>,
+ iXLen);
+
+define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vsoxei8.v v8, (a0), v16
+; CHECK-NEXT: ret
+entry:
+ call void @llvm.riscv.vsoxei.nxv8f64.nxv8i8(
+ <vscale x 8 x double> %0,
+ ptr %1,
+ <vscale x 8 x i8> %2,
+ iXLen %3)
+
+ ret void
+}
+
+declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i8(
+ <vscale x 8 x double>,
+ ptr,
+ <vscale x 8 x i8>,
+ <vscale x 8 x i1>,
+ iXLen);
+
+define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; CHECK-NEXT: vsoxei8.v v8, (a0), v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i8(
+ <vscale x 8 x double> %0,
+ ptr %1,
+ <vscale x 8 x i8> %2,
+ <vscale x 8 x i1> %3,
+ iXLen %4)
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll
new file mode 100644
index 0000000000000..9bd272a368d20
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll
@@ -0,0 +1,1310 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+zvfbfmin -global-isel -verify-machineinstrs \
+; RUN: < %s | FileCheck %s
+
+; The intrinsics are not supported with RV32.
+ +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +define void @intrinsic_vsuxei_allonesmask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_allonesmask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> splat (i1 true), + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i64( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 
4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i64( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i64( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i64( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, 
a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i64( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i64( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i64( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + i64); + +define void 
@intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i64( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i64( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i64( + <vscale x 4 x i32> %0, 
+ ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i64( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i64( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK: # %bb.0: 
# %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i64( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i64( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i64( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void 
@intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i64( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i64( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i64( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i64( + <vscale 
x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i64( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i64( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i64( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i64( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i64( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i64> %2, + <vscale x 8 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i64>, + <vscale x 1 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i64( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i64> %2, + <vscale x 1 x i1> %3, + i64 %4) + + ret void +} + +declare 
void @llvm.riscv.vsuxei.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i64>, + <vscale x 2 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i64( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i64> %2, + <vscale x 2 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i64>, + <vscale x 4 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i64( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i64> %2, + <vscale x 4 x i1> %3, + i64 %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + i64); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei64.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i64( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i64> %2, + i64 %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i64( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i64>, + <vscale x 8 x i1>, + i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, 
e64, m8, ta, ma
+; CHECK-NEXT: vsuxei64.v v8, (a0), v16, v0.t
+; CHECK-NEXT: ret
+entry:
+ call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i64(
+ <vscale x 8 x double> %0,
+ ptr %1,
+ <vscale x 8 x i64> %2,
+ <vscale x 8 x i1> %3,
+ i64 %4)
+
+ ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll
new file mode 100644
index 0000000000000..7cd15454d40b9
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll
@@ -0,0 +1,4881 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN: -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \
+; RUN: -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s
+
+declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i32(
+ <vscale x 1 x i8>,
+ ptr,
+ <vscale x 1 x i32>,
+ iXLen);
+
+define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vsuxei32.v v8, (a0), v9
+; CHECK-NEXT: ret
+entry:
+ call void @llvm.riscv.vsuxei.nxv1i8.nxv1i32(
+ <vscale x 1 x i8> %0,
+ ptr %1,
+ <vscale x 1 x i32> %2,
+ iXLen %3)
+
+ ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i32(
+ <vscale x 1 x i8>,
+ ptr,
+ <vscale x 1 x i32>,
+ <vscale x 1 x i1>,
+ iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i32(
+ <vscale x 1 x i8> %0,
+ ptr %1,
+ <vscale x 1 x i32> %2,
+ <vscale x 1 x i1> %3,
+ iXLen %4)
+
+ ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i32(
+ <vscale x 2 x i8>,
+ ptr,
+ <vscale x 2 x i32>,
+ iXLen);
+
+define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vsuxei32.v v8, (a0), v9
+; CHECK-NEXT: ret
+entry:
+ call void @llvm.riscv.vsuxei.nxv2i8.nxv2i32(
+ <vscale x 2 x i8> %0,
+ ptr %1,
+ <vscale x 2 x i32> %2,
+ iXLen %3)
+
+ ret void
+}
+
+declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i32(
+ <vscale x 2 x i8>,
+ ptr,
+ <vscale x 2 x i32>,
+ <vscale x 2 x i1>,
+ iXLen);
+
+define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind {
+; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t
+; CHECK-NEXT: ret
+entry:
+ call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i32(
+ <vscale x 2 x i8> %0,
+ ptr %1,
+ <vscale x 2 x i32> %2,
+ <vscale x 2 x i1> %3,
+ iXLen %4)
+
+ ret void
+}
+
+declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i32(
+ <vscale x 4 x i8>,
+ ptr,
+ <vscale x 4 x i32>,
+ iXLen);
+
+define void 
@intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i32( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i32( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i8.nxv16i32( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i32( + <vscale x 16 x 
i8> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i32( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i32( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i32( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i32( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i32( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsuxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i32( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i32( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i32( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli 
zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i32( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i32( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i32( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + 
ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i32( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i32( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret 
+entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i32( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i32( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i32( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 
4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i32( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i32( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i32( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr 
%1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i32( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i32( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i32( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i32( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i32>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i32( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i32> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i32>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i32( + <vscale x 1 x double> %0, + ptr %1, + <vscale 
x 1 x i32> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i32>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i32( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i32> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i32>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i32( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i32> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i32>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i32( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i32> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i16( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i16( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i16>, + <vscale 
x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i16( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i16( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i16( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i16( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i16( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i16( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, 
<vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i16( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i16( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i16( + <vscale x 16 x i16> %0, + ptr %1, + 
<vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i16( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i16( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i16( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i16( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i16( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsuxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i16( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i16( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i16( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i16( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i16( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i16( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + 
<vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i16( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i16( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: 
ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i16( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i16( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i16>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i16( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i16> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void 
@intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i16( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i16( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i16( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i16( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i16>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i16( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i16> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i16>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i16( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i16> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + iXLen); + +define void 
@intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i16>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i16( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i16> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i16>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i16( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i16> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i16>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret 
+entry: + call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i16( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i16> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i8( + <vscale x 1 x i8> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i8( + <vscale x 2 x i8> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i8( + <vscale x 4 x i8> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i8( + <vscale x 8 x i8> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i8( + <vscale x 16 x i8> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8>, + ptr, + <vscale x 32 x i8>, + <vscale x 
32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i8( + <vscale x 32 x i8> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8>, + ptr, + <vscale x 64 x i8>, + <vscale x 64 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv64i8.nxv64i8( + <vscale x 64 x i8> %0, + ptr %1, + <vscale x 64 x i8> %2, + <vscale x 64 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i8( + <vscale x 1 x i16> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i8( + <vscale x 2 x i16> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i8( + <vscale x 4 x i16> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i8( + <vscale x 8 x i16> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; 
CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i8( + <vscale x 16 x i16> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i8( + <vscale x 32 x i16> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i8( + <vscale x 1 x i32> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + 
+ ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i8( + <vscale x 2 x i32> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i8( + <vscale x 4 x i32> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v 
v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i8( + <vscale x 8 x i32> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i8( + <vscale x 16 x i32> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i8( + <vscale x 1 x i64> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, 
<vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i8( + <vscale x 2 x i64> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i8( + <vscale x 4 x i64> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i8( + <vscale x 8 x i64> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void 
@llvm.riscv.vsuxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i8( + <vscale x 1 x half> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i8( + <vscale x 2 x half> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i8( + <vscale x 4 x half> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; 
CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i8( + <vscale x 8 x half> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i8( + <vscale x 16 x half> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half>, + ptr, + <vscale x 32 x i8>, + <vscale x 32 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i8( + <vscale x 32 x half> %0, + ptr %1, + <vscale x 32 x i8> %2, + <vscale x 32 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x 
i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i8( + <vscale x 1 x float> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i8( + <vscale x 2 x float> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void 
@llvm.riscv.vsuxei.mask.nxv4f32.nxv4i8( + <vscale x 4 x float> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i8( + <vscale x 8 x float> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float>, + ptr, + <vscale x 16 x i8>, + <vscale x 16 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i8( + <vscale x 16 x float> %0, + ptr %1, + <vscale x 16 x i8> %2, + <vscale x 16 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double>, + ptr, + <vscale x 1 x i8>, + <vscale x 1 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x 
i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i8( + <vscale x 1 x double> %0, + ptr %1, + <vscale x 1 x i8> %2, + <vscale x 1 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double>, + ptr, + <vscale x 2 x i8>, + <vscale x 2 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i8( + <vscale x 2 x double> %0, + ptr %1, + <vscale x 2 x i8> %2, + <vscale x 2 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + iXLen %3) + + ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double>, + ptr, + <vscale x 4 x i8>, + <vscale x 4 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i8( + <vscale x 4 x double> %0, + ptr %1, + <vscale x 4 x i8> %2, + <vscale x 4 x i1> %3, + iXLen %4) + + ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + iXLen); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16 +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + iXLen %3) + + ret void +} 
+ +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double>, + ptr, + <vscale x 8 x i8>, + <vscale x 8 x i1>, + iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT: ret +entry: + call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i8( + <vscale x 8 x double> %0, + ptr %1, + <vscale x 8 x i8> %2, + <vscale x 8 x i1> %3, + iXLen %4) + + ret void +} diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 988d0490afeb6..cf44af608542c 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -137,6 +137,7 @@ ; CHECK-NEXT: shifted-zextw-fusion - Enable SLLI+SRLI to be fused when computing (shifted) word zero extension. ; CHECK-NEXT: shlcofideleg - 'Shlcofideleg' (Delegating LCOFI Interrupts to VS-mode). ; CHECK-NEXT: short-forward-branch-i-minmax - Enable short forward branch optimization for min,max instructions in Zbb. +; CHECK-NEXT: short-forward-branch-i-mul - Enable short forward branch optimization for mul instruction. ; CHECK-NEXT: short-forward-branch-opt - Enable short forward branch optimization. ; CHECK-NEXT: shtvala - 'Shtvala' (htval provides all needed values). ; CHECK-NEXT: shvsatpa - 'Shvsatpa' (vsatp supports all modes supported by satp). diff --git a/llvm/test/CodeGen/RISCV/mask-variable-shift.ll b/llvm/test/CodeGen/RISCV/mask-variable-shift.ll new file mode 100644 index 0000000000000..4e73cee30ef08 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/mask-variable-shift.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=RV32 +; RUN: llc -mtriple=riscv64-none-elf -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=RV64 + +define i32 @mask_pair(i32 %x, i32 %y) { +; RV32-LABEL: mask_pair: +; RV32: # %bb.0: +; RV32-NEXT: srl a0, a0, a1 +; RV32-NEXT: sll a0, a0, a1 +; RV32-NEXT: ret +; +; RV64-LABEL: mask_pair: +; RV64: # %bb.0: +; RV64-NEXT: srlw a0, a0, a1 +; RV64-NEXT: sllw a0, a0, a1 +; RV64-NEXT: ret + %shl = shl nsw i32 -1, %y + %and = and i32 %shl, %x + ret i32 %and +} + +define i64 @mask_pair_64(i64 %x, i64 %y) { +; RV32-LABEL: mask_pair_64: +; RV32: # %bb.0: +; RV32-NEXT: li a3, -1 +; RV32-NEXT: addi a4, a2, -32 +; RV32-NEXT: sll a3, a3, a2 +; RV32-NEXT: bltz a4, .LBB1_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: j .LBB1_3 +; RV32-NEXT: .LBB1_2: +; RV32-NEXT: not a2, a2 +; RV32-NEXT: lui a5, 524288 +; RV32-NEXT: addi a5, a5, -1 +; RV32-NEXT: srl a2, a5, a2 +; RV32-NEXT: or a2, a3, a2 +; RV32-NEXT: .LBB1_3: +; RV32-NEXT: srai a4, a4, 31 +; RV32-NEXT: and a3, a4, a3 +; RV32-NEXT: and a1, a2, a1 +; RV32-NEXT: and a0, a3, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: mask_pair_64: +; RV64: # %bb.0: +; RV64-NEXT: srl a0, a0, a1 +; RV64-NEXT: sll a0, a0, a1 +; RV64-NEXT: ret + %shl = shl nsw i64 -1, %y + %and = and i64 %shl, %x + ret i64 %and +} + +define i128 @mask_pair_128(i128 %x, i128 %y) { +; RV32-LABEL: mask_pair_128: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -32 +; RV32-NEXT: .cfi_def_cfa_offset 32 +; RV32-NEXT: lw a5, 0(a1) +; RV32-NEXT: lw a4, 4(a1) +; RV32-NEXT: lw a3, 8(a1) +; RV32-NEXT: lw a1, 
12(a1) +; RV32-NEXT: lw a2, 0(a2) +; RV32-NEXT: li a6, -1 +; RV32-NEXT: sw zero, 0(sp) +; RV32-NEXT: sw zero, 4(sp) +; RV32-NEXT: sw zero, 8(sp) +; RV32-NEXT: sw zero, 12(sp) +; RV32-NEXT: addi a7, sp, 16 +; RV32-NEXT: sw a6, 16(sp) +; RV32-NEXT: sw a6, 20(sp) +; RV32-NEXT: sw a6, 24(sp) +; RV32-NEXT: sw a6, 28(sp) +; RV32-NEXT: srli a6, a2, 3 +; RV32-NEXT: andi a6, a6, 12 +; RV32-NEXT: sub a6, a7, a6 +; RV32-NEXT: lw a7, 4(a6) +; RV32-NEXT: lw t0, 8(a6) +; RV32-NEXT: lw t1, 12(a6) +; RV32-NEXT: lw a6, 0(a6) +; RV32-NEXT: andi t2, a2, 31 +; RV32-NEXT: xori t2, t2, 31 +; RV32-NEXT: sll t1, t1, a2 +; RV32-NEXT: srli t3, t0, 1 +; RV32-NEXT: sll t0, t0, a2 +; RV32-NEXT: srli t4, a7, 1 +; RV32-NEXT: sll a7, a7, a2 +; RV32-NEXT: sll a2, a6, a2 +; RV32-NEXT: srli a6, a6, 1 +; RV32-NEXT: srl t3, t3, t2 +; RV32-NEXT: srl t4, t4, t2 +; RV32-NEXT: srl a6, a6, t2 +; RV32-NEXT: and a2, a2, a5 +; RV32-NEXT: or a5, t1, t3 +; RV32-NEXT: or t0, t0, t4 +; RV32-NEXT: or a6, a7, a6 +; RV32-NEXT: and a4, a6, a4 +; RV32-NEXT: and a3, t0, a3 +; RV32-NEXT: and a1, a5, a1 +; RV32-NEXT: sw a2, 0(a0) +; RV32-NEXT: sw a4, 4(a0) +; RV32-NEXT: sw a3, 8(a0) +; RV32-NEXT: sw a1, 12(a0) +; RV32-NEXT: addi sp, sp, 32 +; RV32-NEXT: .cfi_def_cfa_offset 0 +; RV32-NEXT: ret +; +; RV64-LABEL: mask_pair_128: +; RV64: # %bb.0: +; RV64-NEXT: li a5, -1 +; RV64-NEXT: addi a4, a2, -64 +; RV64-NEXT: sll a3, a5, a2 +; RV64-NEXT: bltz a4, .LBB2_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: j .LBB2_3 +; RV64-NEXT: .LBB2_2: +; RV64-NEXT: not a2, a2 +; RV64-NEXT: srli a5, a5, 1 +; RV64-NEXT: srl a2, a5, a2 +; RV64-NEXT: or a2, a3, a2 +; RV64-NEXT: .LBB2_3: +; RV64-NEXT: srai a4, a4, 63 +; RV64-NEXT: and a3, a4, a3 +; RV64-NEXT: and a1, a2, a1 +; RV64-NEXT: and a0, a3, a0 +; RV64-NEXT: ret + %shl = shl nsw i128 -1, %y + %and = and i128 %shl, %x + ret i128 %and +} diff --git a/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll index c489bc3681876..aa63552eb4b63 100644 --- a/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll +++ b/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll @@ -488,5 +488,5 @@ declare <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double>) declare <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float>) ;. ; CHECK: attributes #[[ATTR0]] = { "target-features"="+v" } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+v" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+v" } ;. diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll new file mode 100644 index 0000000000000..bf0a2e5e35a01 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll @@ -0,0 +1,41 @@ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh < %s | FileCheck %s + +; CHECK-LABEL: .section .llvm_stackmaps +; CHECK-NEXT: __LLVM_StackMaps: +; Header +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 0 +; Num Functions +; CHECK-NEXT: .word 1 +; Num LargeConstants +; CHECK-NEXT: .word 0 +; Num Callsites +; CHECK-NEXT: .word 1 + +; Functions and stack size +; CHECK-NEXT: .quad liveArgs +; CHECK-NEXT: .quad 0 +; CHECK-NEXT: .quad 1 + +; Spilled stack map values. +; +; Verify 3 stack map entries. 
+; +; CHECK-LABEL: .word .L{{.*}}-liveArgs +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .half 25 +; +; Check that at least one is a spilled entry from SP. +; Location: Indirect SP + ... +; CHECK: .byte 3 +; CHECK-NEXT: .byte 0 +; CHECK-NEXT: .half 8 +; CHECK-NEXT: .half 2 +; CHECK-NEXT: .half 0 +; CHECK-NEXT: .word +define void @liveArgs(double %arg0, double %arg1, double %arg2, double %arg3, double %arg4, double %arg5, double %arg6, double %arg7, double %arg8, double %arg9, double %arg10, double %arg11, double %arg12, double %arg13, double %arg14, double %arg15, double %arg16, double %arg17, double %arg18, double %arg19, double %arg20, double %arg21, double %arg22, double %arg23, half %arg24, half %arg25, half %arg26, half %arg27, half %arg28, bfloat %arg29) { +entry: + call void (i64, i32, ptr, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 28, ptr null, i32 5, double %arg0, double %arg1, double %arg2, double %arg3, double %arg4, double %arg5, double %arg6, double %arg7, double %arg8, double %arg9, double %arg10, double %arg11, double %arg12, double %arg13, double %arg14, double %arg15, double %arg16, double %arg17, double %arg18, double %arg19, double %arg20, double %arg21, double %arg22, double %arg23, half %arg24, half %arg25, half %arg26, half %arg27, half %arg28, bfloat %arg29) + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll index c50a0fb3ffe91..320a3aa94cd7d 100644 --- a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll +++ b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll @@ -286,8 +286,8 @@ define void @liveConstant() { ; CHECK-NEXT: .half 0 ; CHECK-NEXT: .half 28 ; -; Check that at least one is a spilled entry from RBP. -; Location: Indirect RBP + ... +; Check that at least one is a spilled entry from SP. +; Location: Indirect SP + ... 
; CHECK: .byte 3 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 8 @@ -307,7 +307,7 @@ entry: ; CHECK-NEXT: .half 0 ; 1 location ; CHECK-NEXT: .half 1 -; Loc 0: Direct RBP - ofs +; Loc 0: Direct SP + ofs ; CHECK-NEXT: .byte 2 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 8 @@ -320,14 +320,14 @@ entry: ; CHECK-NEXT: .half 0 ; 2 locations ; CHECK-NEXT: .half 2 -; Loc 0: Direct RBP - ofs +; Loc 0: Direct SP + ofs ; CHECK-NEXT: .byte 2 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 8 ; CHECK-NEXT: .half 2 ; CHECK-NEXT: .half 0 ; CHECK-NEXT: .word -; Loc 1: Direct RBP - ofs +; Loc 1: Direct SP + ofs ; CHECK-NEXT: .byte 2 ; CHECK-NEXT: .byte 0 ; CHECK-NEXT: .half 8 diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll new file mode 100644 index 0000000000000..3f780fddafcce --- /dev/null +++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll @@ -0,0 +1,156 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=riscv32 -mattr=+m | FileCheck %s --check-prefixes=RV32I-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m | FileCheck %s --check-prefixes=RV64I-M +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFB-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+short-forward-branch-opt | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFB-M +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+short-forward-branch-i-mul | \ +; RUN: FileCheck %s --check-prefixes=RV32I-SFBIMul-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+short-forward-branch-i-mul | \ +; RUN: FileCheck %s --check-prefixes=RV64I-SFBIMul-M + +define i32 @select_example_mul_i32(i32 %a, i32 %b, i1 zeroext %x, i32 %y) { +; RV32I-M-LABEL: select_example_mul_i32: +; RV32I-M: # %bb.0: # %entry +; RV32I-M-NEXT: beqz a2, .LBB0_2 +; RV32I-M-NEXT: # %bb.1: +; RV32I-M-NEXT: mul a1, a0, a3 +; RV32I-M-NEXT: .LBB0_2: # %entry +; RV32I-M-NEXT: mv a0, a1 +; RV32I-M-NEXT: ret +; +; RV64I-M-LABEL: select_example_mul_i32: +; RV64I-M: # %bb.0: # %entry +; RV64I-M-NEXT: beqz a2, .LBB0_2 +; RV64I-M-NEXT: # %bb.1: +; RV64I-M-NEXT: mulw a1, a0, a3 +; RV64I-M-NEXT: .LBB0_2: # %entry +; RV64I-M-NEXT: mv a0, a1 +; RV64I-M-NEXT: ret +; +; RV32I-SFB-M-LABEL: select_example_mul_i32: +; RV32I-SFB-M: # %bb.0: # %entry +; RV32I-SFB-M-NEXT: mul a0, a0, a3 +; RV32I-SFB-M-NEXT: bnez a2, .LBB0_2 +; RV32I-SFB-M-NEXT: # %bb.1: # %entry +; RV32I-SFB-M-NEXT: mv a0, a1 +; RV32I-SFB-M-NEXT: .LBB0_2: # %entry +; RV32I-SFB-M-NEXT: ret +; +; RV64I-SFB-M-LABEL: select_example_mul_i32: +; RV64I-SFB-M: # %bb.0: # %entry +; RV64I-SFB-M-NEXT: mulw a0, a0, a3 +; RV64I-SFB-M-NEXT: bnez a2, .LBB0_2 +; RV64I-SFB-M-NEXT: # %bb.1: # %entry +; RV64I-SFB-M-NEXT: mv a0, a1 +; RV64I-SFB-M-NEXT: .LBB0_2: # %entry +; RV64I-SFB-M-NEXT: ret +; +; RV32I-SFBIMul-M-LABEL: select_example_mul_i32: +; RV32I-SFBIMul-M: # %bb.0: # %entry +; RV32I-SFBIMul-M-NEXT: beqz a2, .LBB0_2 +; RV32I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV32I-SFBIMul-M-NEXT: mul a1, a0, a3 +; RV32I-SFBIMul-M-NEXT: .LBB0_2: # %entry +; RV32I-SFBIMul-M-NEXT: mv a0, a1 +; RV32I-SFBIMul-M-NEXT: ret +; +; RV64I-SFBIMul-M-LABEL: select_example_mul_i32: +; RV64I-SFBIMul-M: # %bb.0: # %entry +; RV64I-SFBIMul-M-NEXT: mulw a0, a0, a3 +; RV64I-SFBIMul-M-NEXT: bnez a2, .LBB0_2 +; RV64I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV64I-SFBIMul-M-NEXT: mv a0, a1 +; RV64I-SFBIMul-M-NEXT: .LBB0_2: # %entry +; RV64I-SFBIMul-M-NEXT: ret +entry: + %res = mul i32 %a, %y + %sel = 
select i1 %x, i32 %res, i32 %b + ret i32 %sel +} + +define i64 @select_example_mul_i64(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-M-LABEL: select_example_mul_i64: +; RV32I-M: # %bb.0: # %entry +; RV32I-M-NEXT: beqz a4, .LBB1_2 +; RV32I-M-NEXT: # %bb.1: +; RV32I-M-NEXT: mul a2, a0, a6 +; RV32I-M-NEXT: mulhu a3, a0, a5 +; RV32I-M-NEXT: mul a1, a1, a5 +; RV32I-M-NEXT: add a2, a3, a2 +; RV32I-M-NEXT: add a3, a2, a1 +; RV32I-M-NEXT: mul a2, a0, a5 +; RV32I-M-NEXT: .LBB1_2: # %entry +; RV32I-M-NEXT: mv a0, a2 +; RV32I-M-NEXT: mv a1, a3 +; RV32I-M-NEXT: ret +; +; RV64I-M-LABEL: select_example_mul_i64: +; RV64I-M: # %bb.0: # %entry +; RV64I-M-NEXT: beqz a2, .LBB1_2 +; RV64I-M-NEXT: # %bb.1: +; RV64I-M-NEXT: mul a1, a0, a3 +; RV64I-M-NEXT: .LBB1_2: # %entry +; RV64I-M-NEXT: mv a0, a1 +; RV64I-M-NEXT: ret +; +; RV32I-SFB-M-LABEL: select_example_mul_i64: +; RV32I-SFB-M: # %bb.0: # %entry +; RV32I-SFB-M-NEXT: mul a6, a0, a6 +; RV32I-SFB-M-NEXT: mulhu a7, a0, a5 +; RV32I-SFB-M-NEXT: mul a1, a1, a5 +; RV32I-SFB-M-NEXT: mul a0, a0, a5 +; RV32I-SFB-M-NEXT: add a6, a7, a6 +; RV32I-SFB-M-NEXT: beqz a4, .LBB1_2 +; RV32I-SFB-M-NEXT: # %bb.1: # %entry +; RV32I-SFB-M-NEXT: add a3, a6, a1 +; RV32I-SFB-M-NEXT: .LBB1_2: # %entry +; RV32I-SFB-M-NEXT: bnez a4, .LBB1_4 +; RV32I-SFB-M-NEXT: # %bb.3: # %entry +; RV32I-SFB-M-NEXT: mv a0, a2 +; RV32I-SFB-M-NEXT: .LBB1_4: # %entry +; RV32I-SFB-M-NEXT: mv a1, a3 +; RV32I-SFB-M-NEXT: ret +; +; RV64I-SFB-M-LABEL: select_example_mul_i64: +; RV64I-SFB-M: # %bb.0: # %entry +; RV64I-SFB-M-NEXT: mul a0, a0, a3 +; RV64I-SFB-M-NEXT: bnez a2, .LBB1_2 +; RV64I-SFB-M-NEXT: # %bb.1: # %entry +; RV64I-SFB-M-NEXT: mv a0, a1 +; RV64I-SFB-M-NEXT: .LBB1_2: # %entry +; RV64I-SFB-M-NEXT: ret +; +; RV32I-SFBIMul-M-LABEL: select_example_mul_i64: +; RV32I-SFBIMul-M: # %bb.0: # %entry +; RV32I-SFBIMul-M-NEXT: mul a6, a0, a6 +; RV32I-SFBIMul-M-NEXT: mulhu a7, a0, a5 +; RV32I-SFBIMul-M-NEXT: mul a1, a1, a5 +; RV32I-SFBIMul-M-NEXT: add a6, a7, a6 +; RV32I-SFBIMul-M-NEXT: beqz a4, .LBB1_2 +; RV32I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV32I-SFBIMul-M-NEXT: add a3, a6, a1 +; RV32I-SFBIMul-M-NEXT: .LBB1_2: # %entry +; RV32I-SFBIMul-M-NEXT: beqz a4, .LBB1_4 +; RV32I-SFBIMul-M-NEXT: # %bb.3: # %entry +; RV32I-SFBIMul-M-NEXT: mul a2, a0, a5 +; RV32I-SFBIMul-M-NEXT: .LBB1_4: # %entry +; RV32I-SFBIMul-M-NEXT: mv a0, a2 +; RV32I-SFBIMul-M-NEXT: mv a1, a3 +; RV32I-SFBIMul-M-NEXT: ret +; +; RV64I-SFBIMul-M-LABEL: select_example_mul_i64: +; RV64I-SFBIMul-M: # %bb.0: # %entry +; RV64I-SFBIMul-M-NEXT: beqz a2, .LBB1_2 +; RV64I-SFBIMul-M-NEXT: # %bb.1: # %entry +; RV64I-SFBIMul-M-NEXT: mul a1, a0, a3 +; RV64I-SFBIMul-M-NEXT: .LBB1_2: # %entry +; RV64I-SFBIMul-M-NEXT: mv a0, a1 +; RV64I-SFBIMul-M-NEXT: ret +entry: + %res = mul i64 %a, %y + %sel = select i1 %x, i64 %res, i64 %b + ret i64 %sel +} + diff --git a/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll b/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll index ec4884ff643cb..3e0d0cc4cd8e2 100644 --- a/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll +++ b/llvm/test/CodeGen/SPIRV/debug-info/debug-type-pointer.ll @@ -1,7 +1,9 @@ ; RUN: llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info --print-after=spirv-nonsemantic-debug-info -O0 -mtriple=spirv64-unknown-unknown %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-MIR +; RUN: llc --verify-machineinstrs --print-after=spirv-nonsemantic-debug-info -O0 -mtriple=spirv64-amd-amdhsa %s -o - 2>&1 | FileCheck %s --check-prefix=CHECK-MIR ; RUN: llc 
--verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llc --verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: llc --verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_non_semantic_info %s -o - | FileCheck %s --check-prefix=CHECK-OPTION -; TODO(#109287): When type is void * the spirv-val raises an error when DebugInfoNone is set as <id> Base Type argument of DebugTypePointer. +; TODO(#109287): When type is void * the spirv-val raises an error when DebugInfoNone is set as <id> Base Type argument of DebugTypePointer. ; DISABLED: %if spirv-tools %{ llc --verify-machineinstrs --spv-emit-nonsemantic-debug-info --spirv-ext=+SPV_KHR_non_semantic_info -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-MIR-DAG: [[i32type:%[0-9]+\:type]] = OpTypeInt 32, 0 diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll index b1a555a52f40d..6b4e35e997124 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_EXT_optnone.ll @@ -7,6 +7,8 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_EXT_optnone,+SPV_INTEL_optnone %s -o - | FileCheck %s --check-prefixes=CHECK-TWO-EXTENSIONS ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=all %s -o - | FileCheck %s --check-prefixes=CHECK-ALL-EXTENSIONS +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s --check-prefixes=CHECK-ALL-EXTENSIONS + ; CHECK-EXTENSION: OpCapability OptNoneEXT ; CHECK-EXTENSION: OpExtension "SPV_EXT_optnone" ; CHECK-NO-EXTENSION-NOT: OpCapability OptNoneINTEL diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll new file mode 100644 index 0000000000000..4cabddb94df25 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-arithmetic.ll @@ -0,0 +1,142 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: Arithmetic instructions with bfloat16 arguments require the following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic + +; CHECK-DAG: OpCapability BFloat16TypeKHR +; CHECK-DAG: OpCapability BFloat16ArithmeticINTEL +; CHECK-DAG: OpExtension "SPV_KHR_bfloat16" +; CHECK-DAG: OpExtension "SPV_INTEL_bfloat16_arithmetic" +; CHECK-DAG: OpName [[NEG:%.*]] "neg" +; CHECK-DAG: OpName [[NEGV:%.*]] "negv" +; CHECK-DAG: OpName [[ADD:%.*]] "add" +; CHECK-DAG: OpName [[ADDV:%.*]] "addv" +; CHECK-DAG: OpName [[SUB:%.*]] "sub" +; CHECK-DAG: OpName [[SUBV:%.*]] "subv" +; CHECK-DAG: OpName [[MUL:%.*]] "mul" +; CHECK-DAG: OpName [[MULV:%.*]] "mulv" +; CHECK-DAG: OpName [[DIV:%.*]] "div" +; CHECK-DAG: OpName [[DIVV:%.*]] "divv" +; CHECK-DAG: OpName [[REM:%.*]] "rem" +; CHECK-DAG: 
OpName [[REMV:%.*]] "remv" +; CHECK: [[BFLOAT:%.*]] = OpTypeFloat 16 0 +; CHECK: [[BFLOATV:%.*]] = OpTypeVector [[BFLOAT]] 4 + +; CHECK-DAG: [[NEG]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFNegate [[BFLOAT]] [[X]] +define spir_func bfloat @neg(bfloat %x) { +entry: + %r = fneg bfloat %x + ret bfloat %r +} + +; CHECK-DAG: [[NEGV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFNegate [[BFLOATV]] [[X]] +define spir_func <4 x bfloat> @negv(<4 x bfloat> %x) { +entry: + %r = fneg <4 x bfloat> %x + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[ADD]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFAdd [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @add(bfloat %x, bfloat %y) { +entry: + %r = fadd bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[ADDV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFAdd [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @addv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fadd <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[SUB]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFSub [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @sub(bfloat %x, bfloat %y) { +entry: + %r = fsub bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[SUBV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFSub [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @subv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fsub <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[MUL]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFMul [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @mul(bfloat %x, bfloat %y) { +entry: + %r = fmul bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[MULV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFMul [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @mulv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fmul <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[DIV]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFDiv [[BFLOAT]] [[X]] [[Y]] +define spir_func bfloat @div(bfloat %x, bfloat %y) { +entry: + %r = fdiv bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[DIVV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFDiv [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @divv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = fdiv <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} + +; CHECK-DAG: [[REM]] = OpFunction [[BFLOAT]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-DAG: [[R:%.*]] = OpFRem [[BFLOAT]] [[X]] [[Y]] +define spir_func 
bfloat @rem(bfloat %x, bfloat %y) { +entry: + %r = frem bfloat %x, %y + ret bfloat %r +} + +; CHECK-DAG: [[REMV]] = OpFunction [[BFLOATV]] +; CHECK: [[X:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK: [[Y:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-DAG: [[R:%.*]] = OpFRem [[BFLOATV]] [[X]] [[Y]] +define spir_func <4 x bfloat> @remv(<4 x bfloat> %x, <4 x bfloat> %y) { +entry: + %r = frem <4 x bfloat> %x, %y + ret <4 x bfloat> %r +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll new file mode 100644 index 0000000000000..3774791d58f87 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_bfloat16_arithmetic/bfloat16-relational.ll @@ -0,0 +1,376 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_KHR_bfloat16 %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_bfloat16_arithmetic,+SPV_KHR_bfloat16 %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: Relational instructions with bfloat16 arguments require the following SPIR-V extension: SPV_INTEL_bfloat16_arithmetic + +; CHECK-DAG: OpCapability BFloat16TypeKHR +; CHECK-DAG: OpCapability BFloat16ArithmeticINTEL +; CHECK-DAG: OpExtension "SPV_KHR_bfloat16" +; CHECK-DAG: OpExtension "SPV_INTEL_bfloat16_arithmetic" +; CHECK-DAG: OpName [[UEQ:%.*]] "test_ueq" +; CHECK-DAG: OpName [[OEQ:%.*]] "test_oeq" +; CHECK-DAG: OpName [[UNE:%.*]] "test_une" +; CHECK-DAG: OpName [[ONE:%.*]] "test_one" +; CHECK-DAG: OpName [[ULT:%.*]] "test_ult" +; CHECK-DAG: OpName [[OLT:%.*]] "test_olt" +; CHECK-DAG: OpName [[ULE:%.*]] "test_ule" +; CHECK-DAG: OpName [[OLE:%.*]] "test_ole" +; CHECK-DAG: OpName [[UGT:%.*]] "test_ugt" +; CHECK-DAG: OpName [[OGT:%.*]] "test_ogt" +; CHECK-DAG: OpName [[UGE:%.*]] "test_uge" +; CHECK-DAG: OpName [[OGE:%.*]] "test_oge" +; CHECK-DAG: OpName [[UNO:%.*]] "test_uno" +; CHECK-DAG: OpName [[ORD:%.*]] "test_ord" +; CHECK-DAG: OpName [[v3UEQ:%.*]] "test_v3_ueq" +; CHECK-DAG: OpName [[v3OEQ:%.*]] "test_v3_oeq" +; CHECK-DAG: OpName [[v3UNE:%.*]] "test_v3_une" +; CHECK-DAG: OpName [[v3ONE:%.*]] "test_v3_one" +; CHECK-DAG: OpName [[v3ULT:%.*]] "test_v3_ult" +; CHECK-DAG: OpName [[v3OLT:%.*]] "test_v3_olt" +; CHECK-DAG: OpName [[v3ULE:%.*]] "test_v3_ule" +; CHECK-DAG: OpName [[v3OLE:%.*]] "test_v3_ole" +; CHECK-DAG: OpName [[v3UGT:%.*]] "test_v3_ugt" +; CHECK-DAG: OpName [[v3OGT:%.*]] "test_v3_ogt" +; CHECK-DAG: OpName [[v3UGE:%.*]] "test_v3_uge" +; CHECK-DAG: OpName [[v3OGE:%.*]] "test_v3_oge" +; CHECK-DAG: OpName [[v3UNO:%.*]] "test_v3_uno" +; CHECK-DAG: OpName [[v3ORD:%.*]] "test_v3_ord" +; CHECK: [[BFLOAT:%.*]] = OpTypeFloat 16 0 +; CHECK: [[BFLOATV:%.*]] = OpTypeVector [[BFLOAT]] 3 + +; CHECK: [[UEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ueq(bfloat %a, bfloat %b) { + %r = fcmp ueq bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter 
[[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_oeq(bfloat %a, bfloat %b) { + %r = fcmp oeq bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UNE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_une(bfloat %a, bfloat %b) { + %r = fcmp une bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ONE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_one(bfloat %a, bfloat %b) { + %r = fcmp one bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ULT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ult(bfloat %a, bfloat %b) { + %r = fcmp ult bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OLT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_olt(bfloat %a, bfloat %b) { + %r = fcmp olt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ULE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ule(bfloat %a, bfloat %b) { + %r = fcmp ule bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OLE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ole(bfloat %a, bfloat %b) { + %r = fcmp ole bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ugt(bfloat %a, bfloat %b) { + %r = fcmp ugt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ogt(bfloat %a, bfloat %b) { + %r = fcmp ogt bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; 
CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_uge(bfloat %a, bfloat %b) { + %r = fcmp uge bfloat %a, %b + ret i1 %r +} + +; CHECK: [[OGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_oge(bfloat %a, bfloat %b) { + %r = fcmp oge bfloat %a, %b + ret i1 %r +} + +; CHECK: [[ORD]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpOrdered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_ord(bfloat %a, bfloat %b) { + %r = fcmp ord bfloat %a, %b + ret i1 %r +} + +; CHECK: [[UNO]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOAT]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpUnordered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define i1 @test_uno(bfloat %a, bfloat %b) { + %r = fcmp uno bfloat %a, %b + ret i1 %r +} + +; CHECK: [[v3UEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ueq(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ueq <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OEQ]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_oeq(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp oeq <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UNE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_une(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp une <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ONE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdNotEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_one(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp one <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ULT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ult(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ult <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OLT]] = OpFunction +; 
CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_olt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp olt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ULE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ule(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ule <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OLE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdLessThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ole(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ole <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ugt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ugt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OGT]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThan {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ogt(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ogt <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFUnordGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_uge(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp uge <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3OGE]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpFOrdGreaterThanEqual {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_oge(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp oge <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3ORD]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpOrdered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_ord(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp ord <3 x bfloat> %a, %b + ret <3 x i1> %r +} + +; CHECK: [[v3UNO]] = OpFunction +; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter [[BFLOATV]] +; CHECK-NEXT: [[B:%.*]] = OpFunctionParameter [[BFLOATV]] +; 
CHECK-NEXT: OpLabel +; CHECK-NEXT: [[R:%.*]] = OpUnordered {{%.+}} [[A]] [[B]] +; CHECK-NEXT: OpReturnValue [[R]] +; CHECK-NEXT: OpFunctionEnd +define <3 x i1> @test_v3_uno(<3 x bfloat> %a, <3 x bfloat> %b) { + %r = fcmp uno <3 x bfloat> %a, %b + ret <3 x i1> %r +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll new file mode 100644 index 0000000000000..717771c965496 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_kernel_attributes/max_work_group_size.ll @@ -0,0 +1,32 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_kernel_attributes %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s +; %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_kernel_attributes %s -o - -filetype=obj | spirv-val %} +; %if spirv-tools %{ llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - -filetype=obj | spirv-val %} + +; CHECK: OpCapability KernelAttributesINTEL +; CHECK: OpExtension "SPV_INTEL_kernel_attributes" +; CHECK: OpEntryPoint {{.*}} %[[DIM1:[0-9]+]] "Dim1" +; CHECK: OpEntryPoint {{.*}} %[[DIM2:[0-9]+]] "Dim2" +; CHECK: OpEntryPoint {{.*}} %[[DIM3:[0-9]+]] "Dim3" +; CHECK: OpExecutionMode %[[DIM1]] MaxWorkgroupSizeINTEL 4 1 1 +; CHECK: OpExecutionMode %[[DIM2]] MaxWorkgroupSizeINTEL 8 4 1 +; CHECK: OpExecutionMode %[[DIM3]] MaxWorkgroupSizeINTEL 16 8 4 +; CHECK: %[[DIM1]] = OpFunction +; CHECK: %[[DIM2]] = OpFunction +; CHECK: %[[DIM3]] = OpFunction + +define spir_kernel void @Dim1() !max_work_group_size !0 { + ret void +} + +define spir_kernel void @Dim2() !max_work_group_size !1 { + ret void +} + +define spir_kernel void @Dim3() !max_work_group_size !2 { + ret void +} + +!0 = !{i32 4} +!1 = !{i32 8, i32 4} +!2 = !{i32 16, i32 8, i32 4} diff --git a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll index f745794e11de1..15905dd1894e2 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions.ll @@ -1,4 +1,5 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=all %s -o - | FileCheck %s +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-amd-amdhsa %s -o - | FileCheck %s define i6 @getConstantI6() { ret i6 2 diff --git a/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll b/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll index afffd9e69b454..11e7d006c5ecf 100644 --- a/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll +++ b/llvm/test/CodeGen/SPIRV/physical-layout/generator-magic-number.ll @@ -1,4 +1,6 @@ ; REQUIRES: spirv-tools ; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s +; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - --filetype=obj | spirv-dis | FileCheck --check-prefix=AMDGCNSPIRV %s ; CHECK: Generator: {{.*}}{{43|LLVM SPIR-V Backend}}{{.*}} +; AMDGCNSPIRV: Generator: {{.*}}{{65535|LLVM SPIR-V Backend}}{{.*}} diff --git a/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll b/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll index 686c1e97257ad..49ee9931d1126 100644 --- a/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll +++ b/llvm/test/CodeGen/SPIRV/physical-layout/spirv-version.ll @@ -6,6 +6,7 @@ ; RUN: llc -O0 
-mtriple=spirv64v1.4-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV14 ; RUN: llc -O0 -mtriple=spirv64v1.5-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV15 ; RUN: llc -O0 -mtriple=spirv64v1.6-unknown-unknown %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=CHECK-SPIRV16 +; RUN: llc -O0 -mtriple=spirv64-amd-amdhsa %s -o - --filetype=obj | spirv-dis | FileCheck %s --check-prefix=AMDGCNSPIRV ; CHECK-SPIRV10: Version: 1.0 ; CHECK-SPIRV11: Version: 1.1 @@ -14,3 +15,4 @@ ; CHECK-SPIRV14: Version: 1.4 ; CHECK-SPIRV15: Version: 1.5 ; CHECK-SPIRV16: Version: 1.6 +; AMDGCNSPIRV: Version: 1.6 diff --git a/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll b/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll index 73c46b18bfa78..c9b2968a4aed7 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll @@ -10,6 +10,7 @@ ; CHECK-DAG: %[[#Int8:]] = OpTypeInt 8 0 ; CHECK-DAG: %[[#Half:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#Float:]] = OpTypeFloat 32 ; CHECK-DAG: %[[#Struct:]] = OpTypeStruct %[[#Half]] ; CHECK-DAG: %[[#Void:]] = OpTypeVoid ; CHECK-DAG: %[[#PtrInt8:]] = OpTypePointer CrossWorkgroup %[[#Int8:]] @@ -17,12 +18,20 @@ ; CHECK-DAG: %[[#Int64:]] = OpTypeInt 64 0 ; CHECK-DAG: %[[#PtrInt64:]] = OpTypePointer CrossWorkgroup %[[#Int64]] ; CHECK-DAG: %[[#BarType:]] = OpTypeFunction %[[#Void]] %[[#PtrInt64]] %[[#Struct]] +; CHECK-DAG: %[[#BazType:]] = OpTypeFunction %[[#Void]] %[[#PtrInt8]] %[[#Struct]] %[[#Int8]] %[[#Struct]] %[[#Float]] %[[#Struct]] ; CHECK: OpFunction %[[#Void]] None %[[#FooType]] ; CHECK: OpFunctionParameter %[[#PtrInt8]] ; CHECK: OpFunctionParameter %[[#Struct]] ; CHECK: OpFunction %[[#Void]] None %[[#BarType]] ; CHECK: OpFunctionParameter %[[#PtrInt64]] ; CHECK: OpFunctionParameter %[[#Struct]] +; CHECK: OpFunction %[[#Void]] None %[[#BazType]] +; CHECK: OpFunctionParameter %[[#PtrInt8]] +; CHECK: OpFunctionParameter %[[#Struct]] +; CHECK: OpFunctionParameter %[[#Int8]] +; CHECK: OpFunctionParameter %[[#Struct]] +; CHECK: OpFunctionParameter %[[#Float]] +; CHECK: OpFunctionParameter %[[#Struct]] %t_half = type { half } @@ -38,4 +47,9 @@ entry: ret void } +define spir_kernel void @baz(ptr addrspace(1) %a, %t_half %b, i8 %c, %t_half %d, float %e, %t_half %f) { +entry: + ret void +} + declare spir_func %t_half @_Z29__spirv_SpecConstantComposite(half) diff --git a/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll b/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll new file mode 100644 index 0000000000000..6a9ce4515f5c0 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/unpackfloat2x16.ll @@ -0,0 +1,18 @@ +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: [[SET:%.*]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: [[UINT:%.*]] = OpTypeInt 32 0 +; CHECK-DAG: [[FLOAT:%.*]] = OpTypeFloat 32 +; CHECK-DAG: [[FLOAT2:%.*]] = OpTypeVector [[FLOAT]] 2 + +; CHECK: [[P0:%.*]] = OpFunctionParameter [[UINT]] +; CHECK: [[UNPACK2:%.*]] = OpExtInst [[FLOAT2]] [[SET]] UnpackHalf2x16 [[P0]] +; CHECK: [[UNPACK:%.*]] = OpCompositeExtract [[FLOAT]] [[UNPACK2]] 0 +; CHECK: OpReturnValue [[UNPACK]] +define hidden spir_func noundef nofpclass(nan inf) float @_Z9test_funcj(i32 noundef %0) local_unnamed_addr #0 { + %2 = tail call reassoc nnan ninf nsz arcp afn <2 x float> 
@llvm.spv.unpackhalf2x16.v2f32(i32 %0) + %3 = extractelement <2 x float> %2, i64 0 + ret float %3 +} + diff --git a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll index 678d9a9073155..ff9b6a34c1d53 100644 --- a/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll +++ b/llvm/test/CodeGen/SystemZ/regcoal_remat_empty_subrange.ll @@ -22,10 +22,10 @@ define void @main(i16 %in) { ; CHECK-NEXT: locghile %r3, 1 ; CHECK-NEXT: o %r0, 0(%r1) ; CHECK-NEXT: larl %r1, g_222 -; CHECK-NEXT: lghi %r5, 0 ; CHECK-NEXT: dsgfr %r2, %r0 +; CHECK-NEXT: lghi %r3, 0 ; CHECK-NEXT: stgrl %r2, g_39 -; CHECK-NEXT: stc %r5, 19(%r1) +; CHECK-NEXT: stc %r3, 19(%r1) ; CHECK-NEXT: br %r14 %tmp = load i32, ptr @g_151, align 4 %tmp3 = or i32 %tmp, 1 diff --git a/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll b/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll index 9c638199bb6e6..1cfda8a821bd6 100644 --- a/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll +++ b/llvm/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll @@ -10,7 +10,7 @@ declare i32 @has_ptr_arg(ptr) ; CHECK-LABEL: test_invalid_rtn: ; CHECK: i32.const $push[[L0:[0-9]+]]=, 0{{$}} -; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid.2, $pop[[L0]]{{$}} +; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid.1, $pop[[L0]]{{$}} ; CHECK-NEXT: drop $pop[[L1]]{{$}} ; CHECK-NEXT: i64.const $push[[L0:[0-9]+]]=, 0{{$}} ; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_i64_arg_bitcast_invalid, $pop[[L0]]{{$}} @@ -32,7 +32,7 @@ define void @test_struct_rtn() { ; CHECK-LABEL: test_invalid_arg: ; CHECK: i32.const $push[[L0:[0-9]+]]=, 2{{$}} -; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_ptr_arg_bitcast_invalid.4, $pop[[L0]]{{$}} +; CHECK-NEXT: call $push[[L1:[0-9]+]]=, .Lhas_ptr_arg_bitcast_invalid.2, $pop[[L0]]{{$}} ; CHECK-NEXT: drop $pop[[L1]]{{$}} ; CHECK-NEXT: i32.const $push[[L0:[0-9]+]]=, 2{{$}} ; CHECK-NEXT: call $push[[L1:[0-9]+]]=, has_ptr_arg, $pop[[L0]]{{$}} @@ -54,8 +54,8 @@ entry: ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function -; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid.2: -; CHECK-NEXT: .functype .Lhas_i64_arg_bitcast_invalid.2 (i32) -> (i32) +; CHECK-LABEL: .Lhas_i64_arg_bitcast_invalid.1: +; CHECK-NEXT: .functype .Lhas_i64_arg_bitcast_invalid.1 (i32) -> (i32) ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function @@ -64,7 +64,7 @@ entry: ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function -; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid.4: -; CHECK-NEXT: .functype .Lhas_ptr_arg_bitcast_invalid.4 (i32) -> (i32) +; CHECK-LABEL: .Lhas_ptr_arg_bitcast_invalid.2: +; CHECK-NEXT: .functype .Lhas_ptr_arg_bitcast_invalid.2 (i32) -> (i32) ; CHECK-NEXT: unreachable ; CHECK-NEXT: end_function diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll index 87059c5d474e6..6ae7b2260c15c 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics-no-amx-bitcast.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define 
dso_local void @test_no_bitcast(ptr %A_mem, ptr %B_mem, ptr %C_mem) local_unnamed_addr #0 { ; CHECK-LABEL: @test_no_bitcast( diff --git a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll index 5fb2dcdc1d621..ca7c3573a3294 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-low-intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64 -lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s +; RUN: opt -mtriple=x86_64 -passes=x86-lower-amx-intrinsics -enable-x86-scalar-amx=true %s -S | FileCheck %s define dso_local void @test_amx_load_non_O0(i16 signext %row, i16 signext %col, ptr%ptr, i64 %stride, ptr %vptr) { ; CHECK-LABEL: @test_amx_load_non_O0( diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir index 41e1b5bf22bf1..5c059a4e0539d 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir +++ b/llvm/test/CodeGen/X86/GlobalISel/select-copy.mir @@ -1,5 +1,6 @@ -# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 -# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +# RUN: llc -mtriple=i386-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X86 +# RUN: llc -mtriple=x86_64-linux-gnu -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,X64 --- | @@ -30,24 +31,23 @@ ... --- name: test_copy -# ALL-LABEL: name: test_copy alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } -# ALL: %0:gr8 = COPY $al -# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0 -# ALL-NEXT: $eax = COPY %1 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al + ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s8) = COPY $al %1(s32) = G_ZEXT %0(s8) $eax = COPY %1(s32) @@ -56,24 +56,23 @@ body: | ... 
--- name: test_copy2 -# ALL-LABEL: name: test_copy2 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } -# ALL: %0:gr8 = COPY $al -# ALL-NEXT: %1:gr32 = MOVZX32rr8 %0 -# ALL-NEXT: $eax = COPY %1 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy2 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr8 = COPY $al + ; CHECK-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s8) = COPY $al %1(s32) = G_ZEXT %0(s8) $eax = COPY %1(s32) @@ -82,30 +81,35 @@ body: | ... --- name: test_copy3 -# ALL-LABEL: name: test_copy3 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr16[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] } -# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] } -# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr16 = COPY $ax -# X32-NEXT: %3:gr16_abcd = COPY %0 -# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit -# X64-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; X86-LABEL: name: test_copy3 + ; X86: liveins: $eax + ; X86-NEXT: {{ $}} + ; X86-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax + ; X86-NEXT: [[COPY1:%[0-9]+]]:gr16_abcd = COPY [[COPY]] + ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit + ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]] + ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X86-NEXT: RET 0, implicit $eax + ; + ; X64-LABEL: name: test_copy3 + ; X64: liveins: $eax + ; X64-NEXT: {{ $}} + ; X64-NEXT: [[COPY:%[0-9]+]]:gr16 = COPY $ax + ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit + ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]] + ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X64-NEXT: RET 0, implicit $eax %0(s16) = COPY $ax %1(s8) = G_TRUNC %0(s16) %2(s32) = G_ZEXT %1(s8) @@ -115,27 +119,25 @@ body: | ... 
--- name: test_copy4 -# ALL-LABEL: name: test_copy4 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $eax -# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit -# ALL-NEXT: %2:gr32 = MOVZX32rr16 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax + ; CHECK-LABEL: name: test_copy4 + ; CHECK: liveins: $eax + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $eax + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[MOVZX32rr16_:%[0-9]+]]:gr32 = MOVZX32rr16 [[COPY1]] + ; CHECK-NEXT: $eax = COPY [[MOVZX32rr16_]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s32) = COPY $eax %1(s16) = G_TRUNC %0(s32) %2(s32) = G_ZEXT %1(s16) @@ -145,30 +147,35 @@ body: | ... --- name: test_copy5 -# ALL-LABEL: name: test_copy5 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32[[ABCD:(_abcd)?]], preferred-register: '', flags: [ ] } -# X32-NEXT: - { id: 1, class: gr8_abcd_l, preferred-register: '', flags: [ ] } -# X64-NEXT: - { id: 1, class: gr8, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: gr32, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $edx -# X32-NEXT: %3:gr32_abcd = COPY %0 -# X32-NEXT: %1:gr8_abcd_l = COPY %3.sub_8bit -# X64-NEXT: %1:gr8 = COPY %0.sub_8bit -# ALL-NEXT: %2:gr32 = MOVZX32rr8 %1 -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax,$edx + ; X86-LABEL: name: test_copy5 + ; X86: liveins: $eax, $edx + ; X86-NEXT: {{ $}} + ; X86-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; X86-NEXT: [[COPY1:%[0-9]+]]:gr32_abcd = COPY [[COPY]] + ; X86-NEXT: [[COPY2:%[0-9]+]]:gr8_abcd_l = COPY [[COPY1]].sub_8bit + ; X86-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY2]] + ; X86-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X86-NEXT: RET 0, implicit $eax + ; + ; X64-LABEL: name: test_copy5 + ; X64: liveins: $eax, $edx + ; X64-NEXT: {{ $}} + ; X64-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; X64-NEXT: [[COPY1:%[0-9]+]]:gr8 = COPY [[COPY]].sub_8bit + ; X64-NEXT: [[MOVZX32rr8_:%[0-9]+]]:gr32 = MOVZX32rr8 [[COPY1]] + ; X64-NEXT: $eax = COPY [[MOVZX32rr8_]] + ; X64-NEXT: RET 0, implicit $eax %0(s32) = COPY $edx %1(s8) = G_TRUNC %0(s32) %2(s32) = G_ANYEXT %1(s8) @@ -178,29 +185,26 @@ body: | ... 
--- name: test_copy6 -# ALL-LABEL: name: test_copy6 alignment: 16 legalized: true regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 1, class: gr16, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 2, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] } -# ALL-NEXT: - { id: 3, class: low32_addr_access_rbp, preferred-register: '', flags: [ ] } registers: - { id: 0, class: gpr, preferred-register: '' } - { id: 1, class: gpr, preferred-register: '' } - { id: 2, class: gpr, preferred-register: '' } -# ALL: %0:gr32 = COPY $edx -# ALL-NEXT: %1:gr16 = COPY %0.sub_16bit -# ALL-NEXT: %3:low32_addr_access_rbp = IMPLICIT_DEF -# ALL-NEXT: %2:low32_addr_access_rbp = INSERT_SUBREG %3, %1, %subreg.sub_16bit -# ALL-NEXT: $eax = COPY %2 -# ALL-NEXT: RET 0, implicit $eax body: | bb.1 (%ir-block.0): liveins: $eax,$edx + ; CHECK-LABEL: name: test_copy6 + ; CHECK: liveins: $eax, $edx + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edx + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr16 = COPY [[COPY]].sub_16bit + ; CHECK-NEXT: [[DEF:%[0-9]+]]:low32_addr_access_rbp = IMPLICIT_DEF + ; CHECK-NEXT: [[INSERT_SUBREG:%[0-9]+]]:low32_addr_access_rbp = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.sub_16bit + ; CHECK-NEXT: $eax = COPY [[INSERT_SUBREG]] + ; CHECK-NEXT: RET 0, implicit $eax %0(s32) = COPY $edx %1(s16) = G_TRUNC %0(s32) %2(s32) = G_ANYEXT %1(s16) diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 8007d9dcf13bc..32d225273a6e1 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -203,24 +203,14 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %edx -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: shll %cl, %edx -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB5_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edx, %esi -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: .LBB5_2: -; X86-NEXT: andl 4(%eax), %esi -; X86-NEXT: andl (%eax), %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: setne %al -; X86-NEXT: popl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $32, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al ; X86-NEXT: retl ; ; X64-LABEL: test_ne_i64: @@ -242,38 +232,20 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB6_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB6_2: -; X86-NEXT: movl (%edx), %ecx -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: andl %esi, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: andl %eax, 
%ebp -; X86-NEXT: xorl %esi, %edi -; X86-NEXT: xorl %eax, %ecx -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: setne %al -; X86-NEXT: movl %ecx, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: complement_ne_i64: @@ -300,40 +272,20 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %esi -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB7_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %esi, %edi -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: .LBB7_2: -; X86-NEXT: movl (%edx), %eax -; X86-NEXT: movl 4(%edx), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: notl %edi -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: andl %esi, %ebp -; X86-NEXT: notl %esi -; X86-NEXT: andl %ecx, %edi -; X86-NEXT: andl %eax, %esi -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: sete %al -; X86-NEXT: movl %esi, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: reset_eq_i64: @@ -361,38 +313,20 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { define i1 @set_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: shll %cl, %eax -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB8_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %esi -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB8_2: -; X86-NEXT: movl (%edx), %ecx -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: andl %esi, %ebx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: andl %eax, %ebp -; X86-NEXT: orl %esi, %edi -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: orl %ebx, %ebp -; X86-NEXT: setne %al -; X86-NEXT: movl %ecx, (%edx) -; X86-NEXT: movl %edi, 4(%edx) +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; X64-LABEL: set_ne_i64: @@ -419,52 +353,26 @@ define i1 @set_ne_i64(ptr %word, i32 %position) nounwind { define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind 
{ ; X86-LABEL: init_eq_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl $1, %eax -; X86-NEXT: xorl %edx, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: xorl %edi, %edi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shll %cl, %esi -; X86-NEXT: testb $32, %cl -; X86-NEXT: je .LBB9_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl $0, %eax -; X86-NEXT: .LBB9_2: -; X86-NEXT: movl %edx, %ebx -; X86-NEXT: notl %ebx -; X86-NEXT: movl %eax, %ebp -; X86-NEXT: notl %ebp -; X86-NEXT: je .LBB9_4 -; X86-NEXT: # %bb.3: -; X86-NEXT: movl %esi, %edi -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: .LBB9_4: +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: andl %ecx, %ebx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: andl $32, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx ; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl (%edi), %ecx -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: andl %ecx, %ebp -; X86-NEXT: orl %esi, %ebp -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %ebp, (%edi) -; X86-NEXT: movl %ebx, 4(%edi) -; X86-NEXT: sete %al +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: init_eq_i64: @@ -516,101 +424,25 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $48, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, (%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %esi -; X86-NEXT: movl 24(%esp,%esi), %edi -; X86-NEXT: movl 28(%esp,%esi), %eax -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl 16(%esp,%esi), %edx -; X86-NEXT: movl 20(%esp,%esi), %esi -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: andl 8(%ebx), %edi -; X86-NEXT: andl (%ebx), %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: andl 12(%ebx), %eax -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: andl $96, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; 
X86-NEXT: setb %al ; X86-NEXT: retl ; -; SSE-LABEL: test_ne_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %rsi, %rax -; SSE-NEXT: andq 8(%rdi), %rdx -; SSE-NEXT: andq (%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: setne %al -; SSE-NEXT: retq -; -; AVX2-LABEL: test_ne_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: movl $1, %edx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shldq %cl, %rdx, %rsi -; AVX2-NEXT: shlxq %rcx, %rdx, %rdx -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rdx, %rsi -; AVX2-NEXT: cmovneq %rax, %rdx -; AVX2-NEXT: andq 8(%rdi), %rsi -; AVX2-NEXT: andq (%rdi), %rdx -; AVX2-NEXT: orq %rsi, %rdx -; AVX2-NEXT: setne %al -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_ne_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl $1, %eax -; AVX512-NEXT: xorl %edx, %edx -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shlxq %rcx, %rax, %rax -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rax, %rdx -; AVX512-NEXT: cmovneq %rsi, %rax -; AVX512-NEXT: andq 8(%rdi), %rdx -; AVX512-NEXT: andq (%rdi), %rax -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: setne %al -; AVX512-NEXT: retq +; X64-LABEL: test_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $96, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -623,124 +455,33 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %esi -; X86-NEXT: movl 60(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %edi -; X86-NEXT: movl 52(%esp,%eax), %ebx -; X86-NEXT: shldl %cl, %ebx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl 8(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 12(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: complement_ne_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: xorq %rcx, %rsi -; SSE-NEXT: xorq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: setne %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) -; SSE-NEXT: retq -; -; AVX-LABEL: complement_ne_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq %rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: movq %rcx, %r8 -; AVX-NEXT: andq %rsi, %r8 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: andq %rdx, %r9 -; AVX-NEXT: xorq %rcx, %rsi -; AVX-NEXT: xorq %rax, %rdx -; AVX-NEXT: orq %r8, %r9 -; AVX-NEXT: setne %al -; AVX-NEXT: movq %rdx, (%rdi) -; AVX-NEXT: movq %rsi, 8(%rdi) -; AVX-NEXT: retq +; X64-LABEL: complement_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btcl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -755,124 +496,33 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 
$0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %edx -; X86-NEXT: movl 60(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %esi -; X86-NEXT: movl 52(%esp,%eax), %edi -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl 8(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl (%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: movl 4(%ebx), %ebx -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl %ebx, %ecx -; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl 8(%ebp), %edi -; X86-NEXT: movl %edx, 8(%edi) -; X86-NEXT: movl %eax, 12(%edi) -; X86-NEXT: movl %esi, (%edi) -; X86-NEXT: movl %ecx, 4(%edi) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: reset_eq_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 -; SSE-NEXT: notq %rsi -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: notq %rdx -; SSE-NEXT: andq %rcx, %rsi -; SSE-NEXT: andq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: sete %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) -; SSE-NEXT: retq -; -; AVX-LABEL: reset_eq_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq 
%rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: andnq %rcx, %rsi, %r8 -; AVX-NEXT: andq %rsi, %rcx -; AVX-NEXT: andnq %rax, %rdx, %rsi -; AVX-NEXT: andq %rdx, %rax -; AVX-NEXT: orq %rcx, %rax -; AVX-NEXT: sete %al -; AVX-NEXT: movq %rsi, (%rdi) -; AVX-NEXT: movq %r8, 8(%rdi) -; AVX-NEXT: retq +; X64-LABEL: reset_eq_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setae %al +; X64-NEXT: btrl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -888,350 +538,96 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: set_ne_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btsl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %ld = load i128, ptr %word + %test = and i128 %ld, %bit + %res = or i128 %ld, %bit + %cmp = icmp ne i128 %test, 0 + store i128 %res, ptr %word + ret i1 %cmp +} + +define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { +; X86-LABEL: init_eq_i128: +; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %esi -; X86-NEXT: movl 60(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %edi -; X86-NEXT: movl 52(%esp,%eax), %ebx -; X86-NEXT: shldl %cl, %ebx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl 8(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: andl %edi, %ecx -; 
X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 12(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 8(%eax) -; X86-NEXT: movl %esi, 12(%eax) -; X86-NEXT: movl %edi, (%eax) -; X86-NEXT: movl %ebx, 4(%eax) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: set_ne_i128: +; SSE-LABEL: init_eq_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %edx -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: shldq %cl, %rdx, %rsi -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rdx, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq 8(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r8 -; SSE-NEXT: andq %rsi, %r8 -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: andq %rdx, %r9 -; SSE-NEXT: orq %rcx, %rsi -; SSE-NEXT: orq %rax, %rdx -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: setne %al -; SSE-NEXT: movq %rdx, (%rdi) -; SSE-NEXT: movq %rsi, 8(%rdi) +; SSE-NEXT: andl $96, %esi +; SSE-NEXT: shrl $3, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d +; SSE-NEXT: setae %al +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX-LABEL: set_ne_i128: +; AVX-LABEL: init_eq_i128: ; AVX: # %bb.0: ; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: movl $1, %edx -; AVX-NEXT: xorl %esi, %esi -; AVX-NEXT: shldq %cl, %rdx, %rsi -; AVX-NEXT: shlxq %rcx, %rdx, %rdx -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %rdx, %rsi -; AVX-NEXT: cmovneq %rax, %rdx -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq 8(%rdi), %rcx -; AVX-NEXT: movq %rcx, %r8 -; AVX-NEXT: andq %rsi, %r8 -; AVX-NEXT: movq %rax, %r9 -; AVX-NEXT: andq %rdx, %r9 -; AVX-NEXT: orq %rcx, %rsi -; AVX-NEXT: orq %rax, %rdx -; AVX-NEXT: orq %r8, %r9 -; AVX-NEXT: setne %al -; AVX-NEXT: movq %rdx, (%rdi) -; AVX-NEXT: movq %rsi, 8(%rdi) +; AVX-NEXT: andl $96, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) ; AVX-NEXT: retq - %rem = and i32 %position, 127 - %ofs = zext nneg i32 %rem to i128 - %bit = shl nuw i128 1, %ofs - %ld = load i128, ptr %word - %test = and 
i128 %ld, %bit - %res = or i128 %ld, %bit - %cmp = icmp ne i128 %test, 0 - store i128 %res, ptr %word - ret i1 %cmp -} - -define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { -; X86-LABEL: init_eq_i128: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $128, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movzbl 16(%ebp), %eax -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrb $3, %dl -; X86-NEXT: andb $12, %dl -; X86-NEXT: negb %dl -; X86-NEXT: movsbl %dl, %esi -; X86-NEXT: movl 64(%esp,%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%esp,%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 72(%esp,%esi), %ebx -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: movl 76(%esp,%esi), %edi -; X86-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %edi, %esi -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl 12(%ecx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl 4(%ecx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl 100(%esp,%ecx), %edi -; X86-NEXT: movl 104(%esp,%ecx), %ecx -; X86-NEXT: movl %ecx, %ebx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: shldl %cl, %edi, %ebx -; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: notl %esi -; X86-NEXT: 
andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 108(%esp,%ebx), %ebx -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 96(%esp,%ebx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shll %cl, %ebx -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edi -; X86-NEXT: orl %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 8(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %eax, (%ecx) -; X86-NEXT: movl %edx, 4(%ecx) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; SSE-LABEL: init_eq_i128: -; SSE: # %bb.0: -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %esi -; SSE-NEXT: xorl %r8d, %r8d -; SSE-NEXT: shldq %cl, %rsi, %r8 -; SSE-NEXT: shlq %cl, %rsi -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: xorl %r9d, %r9d -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rsi, %r8 -; SSE-NEXT: cmovneq %r9, %rsi -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %r9, %rax -; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: movq 8(%rdi), %r9 -; SSE-NEXT: movq %r9, %r10 -; SSE-NEXT: andq %r8, %r10 -; SSE-NEXT: notq %r8 -; SSE-NEXT: movq %rcx, %r11 -; SSE-NEXT: andq %rsi, %r11 -; SSE-NEXT: notq %rsi -; SSE-NEXT: andq %r9, %r8 -; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq %rcx, %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: orq %r10, %r11 -; SSE-NEXT: sete %al -; SSE-NEXT: movq %rsi, (%rdi) -; SSE-NEXT: movq %r8, 8(%rdi) -; SSE-NEXT: retq -; -; AVX2-LABEL: init_eq_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %esi -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: shldq %cl, %rsi, %rax -; AVX2-NEXT: xorl %r8d, %r8d -; AVX2-NEXT: movl %edx, %edx -; AVX2-NEXT: xorl %r9d, %r9d -; AVX2-NEXT: shldq %cl, %rdx, %r9 -; AVX2-NEXT: shlxq %rcx, %rsi, %rsi -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rsi, %rax -; AVX2-NEXT: cmovneq %r8, %rsi -; AVX2-NEXT: shlxq %rcx, %rdx, %rcx -; AVX2-NEXT: cmovneq %rcx, %r9 -; AVX2-NEXT: cmovneq %r8, %rcx -; AVX2-NEXT: movq (%rdi), %rdx -; AVX2-NEXT: movq 8(%rdi), %r8 -; AVX2-NEXT: andnq %r8, %rax, %r10 -; AVX2-NEXT: andq %rax, %r8 -; AVX2-NEXT: andnq %rdx, %rsi, %r11 -; AVX2-NEXT: andq %rsi, %rdx -; AVX2-NEXT: orq %r9, %r10 -; AVX2-NEXT: orq %rcx, %r11 -; AVX2-NEXT: orq %r8, %rdx -; AVX2-NEXT: sete %al -; AVX2-NEXT: movq %r11, (%rdi) -; AVX2-NEXT: movq %r10, 8(%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movl $1, %esi -; AVX512-NEXT: xorl %r8d, %r8d -; AVX512-NEXT: shldq %cl, %rsi, %r8 -; AVX512-NEXT: shlxq %rcx, %rsi, %rsi -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: xorl %r9d, 
%r9d -; AVX512-NEXT: shldq %cl, %rdx, %r9 -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rsi, %r8 -; AVX512-NEXT: cmovneq %rax, %rsi -; AVX512-NEXT: shlxq %rcx, %rdx, %rcx -; AVX512-NEXT: cmovneq %rcx, %r9 -; AVX512-NEXT: cmovneq %rax, %rcx -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %rdx -; AVX512-NEXT: andnq %rdx, %r8, %r10 -; AVX512-NEXT: andq %r8, %rdx -; AVX512-NEXT: andnq %rax, %rsi, %r8 -; AVX512-NEXT: andq %rsi, %rax -; AVX512-NEXT: orq %r9, %r10 -; AVX512-NEXT: orq %rcx, %r8 -; AVX512-NEXT: orq %rdx, %rax -; AVX512-NEXT: sete %al -; AVX512-NEXT: movq %r8, (%rdi) -; AVX512-NEXT: movq %r10, 8(%rdi) -; AVX512-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -1252,344 +648,25 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: test_ne_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $224, %esp -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edx), %eax -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edx), %edi -; X86-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edx), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 52(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 4(%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: andl 40(%ebx), %eax -; X86-NEXT: andl 8(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 56(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 24(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: andl 44(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 12(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 60(%edi), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 28(%edi), %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%edx), %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: negl %edx -; X86-NEXT: movl 192(%esp,%edx), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: andl 32(%ebx), %ecx -; X86-NEXT: andl (%ebx), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: andl 16(%ebx), %edi -; X86-NEXT: andl 48(%ebx), %edx -; X86-NEXT: orl %edi, %edx -; 
X86-NEXT: orl %esi, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 36(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 4(%ebx), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 20(%ebx), %ecx -; X86-NEXT: andl 52(%ebx), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl %edx, %eax -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: andl $60, %edx +; X86-NEXT: movl (%eax,%edx), %eax +; X86-NEXT: btl %ecx, %eax +; X86-NEXT: setb %al ; X86-NEXT: retl ; -; SSE-LABEL: test_ne_i512: -; SSE: # %bb.0: -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rbx -; SSE-NEXT: movq -48(%rsp,%rbx), %rdx -; SSE-NEXT: movq -40(%rsp,%rbx), %r14 -; SSE-NEXT: movq %r14, %rax -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq -16(%rsp,%rbx), %r11 -; SSE-NEXT: movq -8(%rsp,%rbx), %r10 -; SSE-NEXT: shldq %cl, %r11, %r10 -; SSE-NEXT: movq -32(%rsp,%rbx), %r9 -; SSE-NEXT: movq -24(%rsp,%rbx), %r15 -; SSE-NEXT: movq %r15, %r8 -; SSE-NEXT: shldq %cl, %r9, %r8 -; SSE-NEXT: movq -56(%rsp,%rbx), %rsi -; SSE-NEXT: shldq %cl, %rsi, %rdx -; SSE-NEXT: shldq %cl, %r15, %r11 -; SSE-NEXT: shldq %cl, %r14, %r9 -; SSE-NEXT: movq -64(%rsp,%rbx), %rbx -; SSE-NEXT: shldq %cl, %rbx, %rsi -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rbx -; SSE-NEXT: andq 32(%rdi), %r9 -; SSE-NEXT: andq 48(%rdi), %r11 -; SSE-NEXT: andq 16(%rdi), %rdx -; SSE-NEXT: orq %r11, %rdx -; SSE-NEXT: andq 40(%rdi), %r8 -; SSE-NEXT: andq 56(%rdi), %r10 -; SSE-NEXT: andq 24(%rdi), %rax -; SSE-NEXT: orq %r10, %rax -; SSE-NEXT: andq (%rdi), %rbx -; SSE-NEXT: orq %r9, %rbx -; SSE-NEXT: orq %rdx, %rbx -; SSE-NEXT: andq 8(%rdi), %rsi -; SSE-NEXT: orq %r8, %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: orq %rbx, %rsi -; SSE-NEXT: setne %al -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: retq -; -; AVX2-LABEL: test_ne_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, 
%rsi -; AVX2-NEXT: movq -48(%rsp,%rsi), %rdx -; AVX2-NEXT: movq -40(%rsp,%rsi), %rbx -; AVX2-NEXT: movq %rbx, %rax -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq -16(%rsp,%rsi), %r11 -; AVX2-NEXT: movq -8(%rsp,%rsi), %r10 -; AVX2-NEXT: shldq %cl, %r11, %r10 -; AVX2-NEXT: movq -32(%rsp,%rsi), %r9 -; AVX2-NEXT: movq -24(%rsp,%rsi), %r14 -; AVX2-NEXT: movq %r14, %r8 -; AVX2-NEXT: shldq %cl, %r9, %r8 -; AVX2-NEXT: movq -64(%rsp,%rsi), %r15 -; AVX2-NEXT: movq -56(%rsp,%rsi), %rsi -; AVX2-NEXT: shldq %cl, %rsi, %rdx -; AVX2-NEXT: shldq %cl, %r14, %r11 -; AVX2-NEXT: shldq %cl, %rbx, %r9 -; AVX2-NEXT: shldq %cl, %r15, %rsi -; AVX2-NEXT: shlxq %rcx, %r15, %rcx -; AVX2-NEXT: andq 32(%rdi), %r9 -; AVX2-NEXT: andq 48(%rdi), %r11 -; AVX2-NEXT: andq 16(%rdi), %rdx -; AVX2-NEXT: andq 40(%rdi), %r8 -; AVX2-NEXT: andq 56(%rdi), %r10 -; AVX2-NEXT: andq 24(%rdi), %rax -; AVX2-NEXT: orq %r11, %rdx -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: andq (%rdi), %rcx -; AVX2-NEXT: orq %r9, %rcx -; AVX2-NEXT: orq %rdx, %rcx -; AVX2-NEXT: andq 8(%rdi), %rsi -; AVX2-NEXT: orq %r8, %rsi -; AVX2-NEXT: orq %rax, %rsi -; AVX2-NEXT: orq %rcx, %rsi -; AVX2-NEXT: setne %al -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq -48(%rsp,%rbx), %rdx -; AVX512-NEXT: movq -40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %rax -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq -16(%rsp,%rbx), %r11 -; AVX512-NEXT: movq -8(%rsp,%rbx), %r10 -; AVX512-NEXT: shldq %cl, %r11, %r10 -; AVX512-NEXT: movq -32(%rsp,%rbx), %r9 -; AVX512-NEXT: movq -24(%rsp,%rbx), %r15 -; AVX512-NEXT: movq %r15, %r8 -; AVX512-NEXT: shldq %cl, %r9, %r8 -; AVX512-NEXT: movq -56(%rsp,%rbx), %rsi -; AVX512-NEXT: shldq %cl, %rsi, %rdx -; AVX512-NEXT: shldq %cl, %r15, %r11 -; AVX512-NEXT: shldq %cl, %r14, %r9 -; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx -; AVX512-NEXT: shldq %cl, %rbx, %rsi -; AVX512-NEXT: shlxq %rcx, %rbx, %rcx -; AVX512-NEXT: andq 32(%rdi), %r9 -; AVX512-NEXT: andq 48(%rdi), %r11 -; AVX512-NEXT: andq 16(%rdi), %rdx -; AVX512-NEXT: andq 40(%rdi), %r8 -; AVX512-NEXT: andq 56(%rdi), %r10 -; AVX512-NEXT: andq 24(%rdi), %rax -; AVX512-NEXT: orq %r11, %rdx -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: andq (%rdi), %rcx -; AVX512-NEXT: orq %r9, %rcx -; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: andq 8(%rdi), %rsi -; AVX512-NEXT: orq %r8, %rsi -; AVX512-NEXT: orq %rax, %rsi -; AVX512-NEXT: orq %rcx, %rsi -; AVX512-NEXT: setne %al -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: test_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: shrl $3, %eax +; X64-NEXT: andl $60, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, 
%ofs @@ -1602,572 +679,33 @@ define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $272, %esp # imm = 0x110 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edx), %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edx), %ebx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl 
%cl, %edi, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 52(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl 56(%edx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: movl 24(%edx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 12(%eax), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl 60(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 28(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: negl %eax -; X86-NEXT: movl 240(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed 
$ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl 32(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %eax -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 16(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: movl 48(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 36(%esi), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl %esi, %edi -; X86-NEXT: movl 52(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: xorl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: 
xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl %ebx, 60(%edx) -; X86-NEXT: movl %edi, 56(%edx) -; X86-NEXT: movl %ecx, 52(%edx) -; X86-NEXT: movl %esi, 44(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 36(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 32(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 24(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 8(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 4(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, (%edx) -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 48(%edx) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btcl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: complement_ne_i512: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $56, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rbx -; SSE-NEXT: movq (%rsp,%rbx), %rsi -; SSE-NEXT: movq 8(%rsp,%rbx), %r14 -; SSE-NEXT: movq %r14, %rax -; SSE-NEXT: shldq %cl, %rsi, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 32(%rsp,%rbx), %r8 -; SSE-NEXT: movq 40(%rsp,%rbx), %rbp -; SSE-NEXT: shldq %cl, %r8, %rbp -; SSE-NEXT: movq 16(%rsp,%rbx), %r9 -; SSE-NEXT: movq 24(%rsp,%rbx), %r15 -; SSE-NEXT: movq %r15, %r10 -; SSE-NEXT: shldq %cl, %r9, %r10 -; SSE-NEXT: movq -8(%rsp,%rbx), %r11 -; SSE-NEXT: shldq %cl, %r11, %rsi -; SSE-NEXT: shldq %cl, %r15, %r8 -; SSE-NEXT: shldq %cl, 
%r14, %r9 -; SSE-NEXT: movq -16(%rsp,%rbx), %rbx -; SSE-NEXT: shldq %cl, %rbx, %r11 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rbx -; SSE-NEXT: movq 24(%rdi), %r15 -; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 56(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 16(%rdi), %r12 -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %r8, %r13 -; SSE-NEXT: andq %rsi, %r12 -; SSE-NEXT: orq %r13, %r12 -; SSE-NEXT: movq %rcx, %r13 -; SSE-NEXT: andq %rbp, %r13 -; SSE-NEXT: andq %rax, %r15 -; SSE-NEXT: orq %r13, %r15 -; SSE-NEXT: movq 32(%rdi), %r14 -; SSE-NEXT: movq %r14, %rcx -; SSE-NEXT: andq %r9, %rcx -; SSE-NEXT: movq (%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rbx, %r13 -; SSE-NEXT: orq %rcx, %r13 -; SSE-NEXT: orq %r12, %r13 -; SSE-NEXT: movq 40(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r12 -; SSE-NEXT: andq %r10, %r12 -; SSE-NEXT: movq 8(%rdi), %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: andq %r11, %rax -; SSE-NEXT: orq %r12, %rax -; SSE-NEXT: orq %r15, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT: xorq %rcx, %r10 -; SSE-NEXT: xorq %r14, %r9 -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; SSE-NEXT: xorq %rdx, %r11 -; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; SSE-NEXT: orq %r13, %rax -; SSE-NEXT: movq %r8, 48(%rdi) -; SSE-NEXT: movq %rbp, 56(%rdi) -; SSE-NEXT: movq %r9, 32(%rdi) -; SSE-NEXT: movq %r10, 40(%rdi) -; SSE-NEXT: movq %rsi, 16(%rdi) -; SSE-NEXT: movq %r15, 24(%rdi) -; SSE-NEXT: movq %rbx, (%rdi) -; SSE-NEXT: movq %r11, 8(%rdi) -; SSE-NEXT: setne %al -; SSE-NEXT: addq $56, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq -; -; AVX2-LABEL: complement_ne_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $72, %rsp -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, (%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rbx -; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX2-NEXT: movq %rbp, %rax -; AVX2-NEXT: shldq %cl, %rsi, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX2-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX2-NEXT: shldq %cl, %r8, %r13 -; AVX2-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX2-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX2-NEXT: movq %r14, %r10 -; AVX2-NEXT: shldq %cl, %r9, %r10 -; AVX2-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX2-NEXT: shldq %cl, %r11, %rsi -; AVX2-NEXT: shldq %cl, %r14, %r8 -; AVX2-NEXT: movq 16(%rdi), %r12 -; AVX2-NEXT: 
movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r8, %r14 -; AVX2-NEXT: andq %rsi, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq 56(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r13, %r15 -; AVX2-NEXT: movq 24(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %rax, %r14 -; AVX2-NEXT: orq %r15, %r14 -; AVX2-NEXT: shldq %cl, %rbp, %r9 -; AVX2-NEXT: movq (%rsp,%rbx), %rdx -; AVX2-NEXT: movq 32(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r9, %r15 -; AVX2-NEXT: shlxq %rcx, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq (%rdi), %rbx -; AVX2-NEXT: movq %rbx, %rbp -; AVX2-NEXT: andq %rax, %rbp -; AVX2-NEXT: orq %r15, %rbp -; AVX2-NEXT: orq %r12, %rbp -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %rdx, %r11 -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: andq %r10, %rcx -; AVX2-NEXT: movq 8(%rdi), %r15 -; AVX2-NEXT: movq %r15, %r12 -; AVX2-NEXT: andq %r11, %r12 -; AVX2-NEXT: orq %rcx, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX2-NEXT: xorq %rax, %r10 -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX2-NEXT: xorq %r15, %r11 -; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: movq %r8, 48(%rdi) -; AVX2-NEXT: movq %r13, 56(%rdi) -; AVX2-NEXT: movq %r9, 32(%rdi) -; AVX2-NEXT: movq %r10, 40(%rdi) -; AVX2-NEXT: movq %rsi, 16(%rdi) -; AVX2-NEXT: movq %rcx, 24(%rdi) -; AVX2-NEXT: movq %rbx, (%rdi) -; AVX2-NEXT: movq %r11, 8(%rdi) -; AVX2-NEXT: setne %al -; AVX2-NEXT: addq $72, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: complement_ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $72, %rsp -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, (%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX512-NEXT: movq %rbp, %rax -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX512-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX512-NEXT: shldq %cl, %r8, %r13 -; AVX512-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX512-NEXT: movq 40(%rsp,%rbx), %r14 -; 
AVX512-NEXT: movq %r14, %r10 -; AVX512-NEXT: shldq %cl, %r9, %r10 -; AVX512-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX512-NEXT: shldq %cl, %r11, %rsi -; AVX512-NEXT: shldq %cl, %r14, %r8 -; AVX512-NEXT: movq 16(%rdi), %r12 -; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r8, %r14 -; AVX512-NEXT: andq %rsi, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq 56(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r13, %r15 -; AVX512-NEXT: movq 24(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %rax, %r14 -; AVX512-NEXT: orq %r15, %r14 -; AVX512-NEXT: shldq %cl, %rbp, %r9 -; AVX512-NEXT: movq (%rsp,%rbx), %rdx -; AVX512-NEXT: movq 32(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r9, %r15 -; AVX512-NEXT: shlxq %rcx, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq (%rdi), %rbx -; AVX512-NEXT: movq %rbx, %rbp -; AVX512-NEXT: andq %rax, %rbp -; AVX512-NEXT: orq %r15, %rbp -; AVX512-NEXT: orq %r12, %rbp -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rdx, %r11 -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andq %r10, %rcx -; AVX512-NEXT: movq 8(%rdi), %r15 -; AVX512-NEXT: movq %r15, %r12 -; AVX512-NEXT: andq %r11, %r12 -; AVX512-NEXT: orq %rcx, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT: xorq %rax, %r10 -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX512-NEXT: xorq %r15, %r11 -; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: movq %r8, 48(%rdi) -; AVX512-NEXT: movq %r13, 56(%rdi) -; AVX512-NEXT: movq %r9, 32(%rdi) -; AVX512-NEXT: movq %r10, 40(%rdi) -; AVX512-NEXT: movq %rsi, 16(%rdi) -; AVX512-NEXT: movq %rcx, 24(%rdi) -; AVX512-NEXT: movq %rbx, (%rdi) -; AVX512-NEXT: movq %r11, 8(%rdi) -; AVX512-NEXT: setne %al -; AVX512-NEXT: addq $72, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: complement_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btcl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs @@ -2182,606 +720,33 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $288, %esp # imm = 0x120 
-; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-NEXT: subl %eax, %edi -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 4(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edi), %eax -; X86-NEXT: andl $31, %ecx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shldl %cl, %edx, %ebx -; X86-NEXT: movl 12(%edi), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edi), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edi), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl %edx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edi), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edi), %esi -; X86-NEXT: movl %esi, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NEXT: movl 44(%edi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl 52(%edi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %esi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl 56(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: movl 44(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edi), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%edi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: negl %eax -; X86-NEXT: movl 256(%esp,%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %esi, %edi -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: movl 32(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %edi -; X86-NEXT: orl 
%edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %ebx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: orl %ebx, %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: andl %edi, %ecx -; X86-NEXT: movl %ecx, %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: movl 52(%ebx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl 
{{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl %edx, 60(%eax) -; X86-NEXT: movl %esi, 56(%eax) -; X86-NEXT: movl %ecx, 52(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 44(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 40(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 36(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 32(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 28(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 24(%eax) -; X86-NEXT: movl %ebx, 20(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 16(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 12(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 8(%eax) -; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, 48(%eax) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl %edx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: reset_eq_i512: 
-; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $56, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rdx -; SSE-NEXT: movq (%rsp,%rdx), %r9 -; SSE-NEXT: movq 8(%rsp,%rdx), %r8 -; SSE-NEXT: movq %r8, %rsi -; SSE-NEXT: shldq %cl, %r9, %rsi -; SSE-NEXT: movq -8(%rsp,%rdx), %rax -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: movq 16(%rsp,%rdx), %r14 -; SSE-NEXT: movq 24(%rsp,%rdx), %r10 -; SSE-NEXT: movq %r10, %rbx -; SSE-NEXT: shldq %cl, %r14, %rbx -; SSE-NEXT: shldq %cl, %r8, %r14 -; SSE-NEXT: movq 32(%rsp,%rdx), %r13 -; SSE-NEXT: movq 40(%rsp,%rdx), %r12 -; SSE-NEXT: shldq %cl, %r13, %r12 -; SSE-NEXT: shldq %cl, %r10, %r13 -; SSE-NEXT: movq -16(%rsp,%rdx), %rdx -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq %r12, %rbp -; SSE-NEXT: movq %r9, %r15 -; SSE-NEXT: movq %rsi, %r11 -; SSE-NEXT: movq 16(%rdi), %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r13 -; SSE-NEXT: andq %r8, %r9 -; SSE-NEXT: orq %r13, %r9 -; SSE-NEXT: movq 56(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r12 -; SSE-NEXT: movq 24(%rdi), %r10 -; SSE-NEXT: andq %r10, %rsi -; SSE-NEXT: orq %r12, %rsi -; SSE-NEXT: movq %r14, %r13 -; SSE-NEXT: movq 32(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %r14 -; SSE-NEXT: movq %rdx, %r12 -; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rcx, %rdx -; SSE-NEXT: orq %r14, %rdx -; SSE-NEXT: orq %r9, %rdx -; SSE-NEXT: movq %rbx, %r14 -; SSE-NEXT: movq 40(%rdi), %rcx -; SSE-NEXT: andq %rcx, %rbx -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: movq 8(%rdi), %r8 -; SSE-NEXT: andq %r8, %rax -; SSE-NEXT: orq %rbx, %rax -; SSE-NEXT: orq %rsi, %rax -; SSE-NEXT: notq %r11 -; SSE-NEXT: andq %r10, %r11 -; SSE-NEXT: notq %r15 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: notq %r14 -; SSE-NEXT: andq %rcx, %r14 -; SSE-NEXT: notq %r13 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; SSE-NEXT: notq %rbp -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE-NEXT: notq %rcx -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; SSE-NEXT: notq %r9 -; SSE-NEXT: andq %r8, %r9 -; SSE-NEXT: notq %r12 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rcx, 48(%rdi) -; SSE-NEXT: movq %rbp, 56(%rdi) -; SSE-NEXT: movq %r13, 32(%rdi) -; SSE-NEXT: movq %r14, 40(%rdi) -; 
SSE-NEXT: movq %r15, 16(%rdi) -; SSE-NEXT: movq %r11, 24(%rdi) -; SSE-NEXT: movq %r12, (%rdi) -; SSE-NEXT: movq %r9, 8(%rdi) -; SSE-NEXT: sete %al -; SSE-NEXT: addq $56, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq -; -; AVX2-LABEL: reset_eq_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: pushq %rax -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rdx -; AVX2-NEXT: movq -48(%rsp,%rdx), %r8 -; AVX2-NEXT: movq -40(%rsp,%rdx), %rbx -; AVX2-NEXT: movq %rbx, %rax -; AVX2-NEXT: shldq %cl, %r8, %rax -; AVX2-NEXT: movq -16(%rsp,%rdx), %r10 -; AVX2-NEXT: movq -8(%rsp,%rdx), %rsi -; AVX2-NEXT: shldq %cl, %r10, %rsi -; AVX2-NEXT: movq -32(%rsp,%rdx), %r11 -; AVX2-NEXT: movq -24(%rsp,%rdx), %r14 -; AVX2-NEXT: movq %r14, %r9 -; AVX2-NEXT: shldq %cl, %r11, %r9 -; AVX2-NEXT: movq -64(%rsp,%rdx), %r15 -; AVX2-NEXT: movq -56(%rsp,%rdx), %rdx -; AVX2-NEXT: shldq %cl, %rdx, %r8 -; AVX2-NEXT: shldq %cl, %r14, %r10 -; AVX2-NEXT: shldq %cl, %rbx, %r11 -; AVX2-NEXT: shldq %cl, %r15, %rdx -; AVX2-NEXT: shlxq %rcx, %r15, %rcx -; AVX2-NEXT: movq 24(%rdi), %rbx -; AVX2-NEXT: movq 56(%rdi), %r14 -; AVX2-NEXT: movq 16(%rdi), %r15 -; AVX2-NEXT: movq 48(%rdi), %r13 -; AVX2-NEXT: movq 32(%rdi), %rbp -; AVX2-NEXT: andnq %rbp, %r11, %r12 -; AVX2-NEXT: andq %r11, %rbp -; AVX2-NEXT: andnq %r13, %r10, %r11 -; AVX2-NEXT: andq %r10, %r13 -; AVX2-NEXT: andnq %r15, %r8, %r10 -; AVX2-NEXT: andq %r8, %r15 -; AVX2-NEXT: movq 40(%rdi), %r8 -; AVX2-NEXT: orq %r13, %r15 -; AVX2-NEXT: andnq %r8, %r9, %r13 -; AVX2-NEXT: andq %r9, %r8 -; AVX2-NEXT: andnq %r14, %rsi, %r9 -; AVX2-NEXT: andq %rsi, %r14 -; AVX2-NEXT: andnq %rbx, %rax, %rsi -; AVX2-NEXT: andq %rax, %rbx -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: orq %r14, %rbx -; AVX2-NEXT: andnq %rax, %rcx, %r14 -; AVX2-NEXT: andq %rcx, %rax -; AVX2-NEXT: orq %rbp, %rax -; AVX2-NEXT: movq 8(%rdi), %rcx -; AVX2-NEXT: orq %r15, %rax -; AVX2-NEXT: andnq %rcx, %rdx, %r15 -; AVX2-NEXT: andq %rdx, %rcx -; AVX2-NEXT: orq %r8, %rcx -; AVX2-NEXT: orq %rbx, %rcx -; AVX2-NEXT: orq %rax, %rcx -; AVX2-NEXT: movq %r11, 48(%rdi) -; AVX2-NEXT: movq %r9, 56(%rdi) -; AVX2-NEXT: movq %r12, 32(%rdi) -; AVX2-NEXT: movq %r13, 40(%rdi) -; AVX2-NEXT: movq %r10, 16(%rdi) -; AVX2-NEXT: movq %rsi, 24(%rdi) -; AVX2-NEXT: movq %r14, (%rdi) -; AVX2-NEXT: movq %r15, 8(%rdi) -; AVX2-NEXT: sete %al -; AVX2-NEXT: addq $8, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: reset_eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: pushq %rax -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, 
-{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq -48(%rsp,%rbx), %r8 -; AVX512-NEXT: movq -40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %rax -; AVX512-NEXT: shldq %cl, %r8, %rax -; AVX512-NEXT: movq -16(%rsp,%rbx), %r10 -; AVX512-NEXT: movq -8(%rsp,%rbx), %rsi -; AVX512-NEXT: shldq %cl, %r10, %rsi -; AVX512-NEXT: movq -32(%rsp,%rbx), %r11 -; AVX512-NEXT: movq -24(%rsp,%rbx), %r15 -; AVX512-NEXT: movq %r15, %r9 -; AVX512-NEXT: shldq %cl, %r11, %r9 -; AVX512-NEXT: movq -56(%rsp,%rbx), %rdx -; AVX512-NEXT: shldq %cl, %rdx, %r8 -; AVX512-NEXT: shldq %cl, %r15, %r10 -; AVX512-NEXT: shldq %cl, %r14, %r11 -; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx -; AVX512-NEXT: shldq %cl, %rbx, %rdx -; AVX512-NEXT: shlxq %rcx, %rbx, %rcx -; AVX512-NEXT: movq 24(%rdi), %rbx -; AVX512-NEXT: movq 56(%rdi), %r14 -; AVX512-NEXT: movq 16(%rdi), %r15 -; AVX512-NEXT: movq 48(%rdi), %r13 -; AVX512-NEXT: movq 32(%rdi), %rbp -; AVX512-NEXT: andnq %rbp, %r11, %r12 -; AVX512-NEXT: andq %r11, %rbp -; AVX512-NEXT: andnq %r13, %r10, %r11 -; AVX512-NEXT: andq %r10, %r13 -; AVX512-NEXT: andnq %r15, %r8, %r10 -; AVX512-NEXT: andq %r8, %r15 -; AVX512-NEXT: movq 40(%rdi), %r8 -; AVX512-NEXT: orq %r13, %r15 -; AVX512-NEXT: andnq %r8, %r9, %r13 -; AVX512-NEXT: andq %r9, %r8 -; AVX512-NEXT: andnq %r14, %rsi, %r9 -; AVX512-NEXT: andq %rsi, %r14 -; AVX512-NEXT: andnq %rbx, %rax, %rsi -; AVX512-NEXT: andq %rax, %rbx -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: orq %r14, %rbx -; AVX512-NEXT: andnq %rax, %rcx, %r14 -; AVX512-NEXT: andq %rcx, %rax -; AVX512-NEXT: orq %rbp, %rax -; AVX512-NEXT: movq 8(%rdi), %rcx -; AVX512-NEXT: orq %r15, %rax -; AVX512-NEXT: andnq %rcx, %rdx, %r15 -; AVX512-NEXT: andq %rdx, %rcx -; AVX512-NEXT: orq %r8, %rcx -; AVX512-NEXT: orq %rbx, %rcx -; AVX512-NEXT: orq %rax, %rcx -; AVX512-NEXT: movq %r11, 48(%rdi) -; AVX512-NEXT: movq %r9, 56(%rdi) -; AVX512-NEXT: movq %r12, 32(%rdi) -; AVX512-NEXT: movq %r13, 40(%rdi) -; AVX512-NEXT: movq %r10, 16(%rdi) -; AVX512-NEXT: movq %rsi, 24(%rdi) -; AVX512-NEXT: movq %r14, (%rdi) -; AVX512-NEXT: movq %r15, 8(%rdi) -; AVX512-NEXT: sete %al -; AVX512-NEXT: addq $8, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: reset_eq_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setae %al +; X64-NEXT: btrl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs @@ -2797,4226 +762,135 @@ define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: btl 
%edx, %edi +; X86-NEXT: setb %al +; X86-NEXT: btsl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: retl +; +; X64-LABEL: set_ne_i512: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $60, %ecx +; X64-NEXT: movl (%rdi,%rcx), %edx +; X64-NEXT: btl %esi, %edx +; X64-NEXT: setb %al +; X64-NEXT: btsl %esi, %edx +; X64-NEXT: movl %edx, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 511 + %ofs = zext nneg i32 %rem to i512 + %bit = shl nuw i512 1, %ofs + %ld = load i512, ptr %word + %test = and i512 %ld, %bit + %res = or i512 %ld, %bit + %cmp = icmp ne i512 %test, 0 + store i512 %res, ptr %word + ret i1 %cmp +} + +define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { +; X86-LABEL: init_eq_i512: +; X86: # %bb.0: ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $272, %esp # imm = 0x110 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrl $3, %eax -; X86-NEXT: andl $60, %eax -; X86-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NEXT: leal {{[0-9]+}}(%esp), %edx -; X86-NEXT: subl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 24(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%edx), %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%edx), %esi -; X86-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%edx), %ebx -; X86-NEXT: movl %ebx, %esi -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%edx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 52(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl 40(%edx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: movl 8(%edx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl 56(%edx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %ebx -; X86-NEXT: movl 24(%edx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%eax), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 12(%eax), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: movl 60(%eax), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %edx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 28(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl (%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: negl %eax -; X86-NEXT: movl 240(%esp,%eax), %esi -; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, (%esp) # 4-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: andl $60, %esi +; X86-NEXT: movl (%edx,%esi), %edi +; X86-NEXT: btl %ecx, %edi +; X86-NEXT: setae %al +; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl 32(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %eax -; X86-NEXT: movl (%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl 16(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: movl 48(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 36(%esi), %ebx -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 4(%esi), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl %esi, %eax -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl %esi, %edi -; X86-NEXT: movl 52(%eax), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl %edi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, (%esp) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl 8(%ebp), %edx -; X86-NEXT: movl %ebx, 60(%edx) -; X86-NEXT: movl %edi, 56(%edx) -; X86-NEXT: movl %ecx, 52(%edx) -; X86-NEXT: movl %esi, 44(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 40(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 36(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 32(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 28(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 24(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 20(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 16(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 12(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 8(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 4(%edx) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, (%edx) -; X86-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-NEXT: movl %eax, 48(%edx) -; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp +; X86-NEXT: shll %cl, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: movl %ebx, (%edx,%esi) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: set_ne_i512: +; SSE-LABEL: init_eq_i512: ; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $56, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp) ; SSE-NEXT: movl %esi, 
%ecx -; SSE-NEXT: andl $63, %ecx ; SSE-NEXT: shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %rbx -; SSE-NEXT: movq (%rsp,%rbx), %rsi -; SSE-NEXT: movq 8(%rsp,%rbx), %r14 -; SSE-NEXT: movq %r14, %rax -; SSE-NEXT: shldq %cl, %rsi, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 32(%rsp,%rbx), %r8 -; SSE-NEXT: movq 40(%rsp,%rbx), %rbp -; SSE-NEXT: shldq %cl, %r8, %rbp -; SSE-NEXT: movq 16(%rsp,%rbx), %r9 -; SSE-NEXT: movq 24(%rsp,%rbx), %r15 -; SSE-NEXT: movq %r15, %r10 -; SSE-NEXT: shldq %cl, %r9, %r10 -; SSE-NEXT: movq -8(%rsp,%rbx), %r11 -; SSE-NEXT: shldq %cl, %r11, %rsi -; SSE-NEXT: shldq %cl, %r15, %r8 -; SSE-NEXT: shldq %cl, %r14, %r9 -; SSE-NEXT: movq -16(%rsp,%rbx), %rbx -; SSE-NEXT: shldq %cl, %rbx, %r11 -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rbx -; SSE-NEXT: movq 24(%rdi), %r15 -; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 56(%rdi), %rcx -; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 16(%rdi), %r12 -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %r8, %r13 -; SSE-NEXT: andq %rsi, %r12 -; SSE-NEXT: orq %r13, %r12 -; SSE-NEXT: movq %rcx, %r13 -; SSE-NEXT: andq %rbp, %r13 -; SSE-NEXT: andq %rax, %r15 -; SSE-NEXT: orq %r13, %r15 -; SSE-NEXT: movq 32(%rdi), %r14 -; SSE-NEXT: movq %r14, %rcx -; SSE-NEXT: andq %r9, %rcx -; SSE-NEXT: movq (%rdi), %r13 -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rbx, %r13 -; SSE-NEXT: orq %rcx, %r13 -; SSE-NEXT: orq %r12, %r13 -; SSE-NEXT: movq 40(%rdi), %rcx -; SSE-NEXT: movq %rcx, %r12 -; SSE-NEXT: andq %r10, %r12 -; SSE-NEXT: movq 8(%rdi), %rdx -; SSE-NEXT: movq %rdx, %rax -; SSE-NEXT: andq %r11, %rax -; SSE-NEXT: orq %r12, %rax -; SSE-NEXT: orq %r15, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT: orq %rcx, %r10 -; SSE-NEXT: orq %r14, %r9 -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; SSE-NEXT: orq %rdx, %r11 -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; SSE-NEXT: orq %r13, %rax -; SSE-NEXT: movq %r8, 48(%rdi) -; SSE-NEXT: movq %rbp, 56(%rdi) -; SSE-NEXT: movq %r9, 32(%rdi) -; SSE-NEXT: movq %r10, 40(%rdi) -; SSE-NEXT: movq %rsi, 16(%rdi) -; SSE-NEXT: movq %r15, 24(%rdi) -; SSE-NEXT: movq %rbx, (%rdi) -; SSE-NEXT: movq %r11, 8(%rdi) -; SSE-NEXT: setne %al -; SSE-NEXT: addq $56, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp +; SSE-NEXT: andl $60, %esi +; SSE-NEXT: movl (%rdi,%rsi), %r8d +; SSE-NEXT: btl %ecx, %r8d +; SSE-NEXT: setae %al +; SSE-NEXT: shll %cl, %edx +; SSE-NEXT: btrl %ecx, %r8d +; SSE-NEXT: orl %r8d, %edx +; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq ; -; AVX2-LABEL: set_ne_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $72, %rsp -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, 
-{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, (%rsp) -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rbx -; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX2-NEXT: movq %rbp, %rax -; AVX2-NEXT: shldq %cl, %rsi, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX2-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX2-NEXT: shldq %cl, %r8, %r13 -; AVX2-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX2-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX2-NEXT: movq %r14, %r10 -; AVX2-NEXT: shldq %cl, %r9, %r10 -; AVX2-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX2-NEXT: shldq %cl, %r11, %rsi -; AVX2-NEXT: shldq %cl, %r14, %r8 -; AVX2-NEXT: movq 16(%rdi), %r12 -; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 48(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r8, %r14 -; AVX2-NEXT: andq %rsi, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq 56(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r13, %r15 -; AVX2-NEXT: movq 24(%rdi), %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %rax, %r14 -; AVX2-NEXT: orq %r15, %r14 -; AVX2-NEXT: shldq %cl, %rbp, %r9 -; AVX2-NEXT: movq (%rsp,%rbx), %rdx -; AVX2-NEXT: movq 32(%rdi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r9, %r15 -; AVX2-NEXT: shlxq %rcx, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq (%rdi), %rbx -; AVX2-NEXT: movq %rbx, %rbp -; AVX2-NEXT: andq %rax, %rbp -; AVX2-NEXT: orq %r15, %rbp -; AVX2-NEXT: orq %r12, %rbp -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %rdx, %r11 -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: andq %r10, %rcx -; AVX2-NEXT: movq 8(%rdi), %r15 -; AVX2-NEXT: movq %r15, %r12 -; AVX2-NEXT: andq %r11, %r12 -; AVX2-NEXT: orq %rcx, %r12 -; AVX2-NEXT: orq %r14, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX2-NEXT: orq %rax, %r10 -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX2-NEXT: orq %r15, %r11 -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: movq %r8, 48(%rdi) -; AVX2-NEXT: movq %r13, 56(%rdi) -; AVX2-NEXT: movq %r9, 32(%rdi) -; AVX2-NEXT: movq %r10, 40(%rdi) -; AVX2-NEXT: movq %rsi, 16(%rdi) -; AVX2-NEXT: movq %rcx, 24(%rdi) -; AVX2-NEXT: movq %rbx, (%rdi) -; AVX2-NEXT: movq %r11, 8(%rdi) -; AVX2-NEXT: setne %al -; AVX2-NEXT: addq $72, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: set_ne_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq 
%rbx -; AVX512-NEXT: subq $72, %rsp -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, (%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rbx -; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi -; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp -; AVX512-NEXT: movq %rbp, %rax -; AVX512-NEXT: shldq %cl, %rsi, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rsp,%rbx), %r8 -; AVX512-NEXT: movq 56(%rsp,%rbx), %r13 -; AVX512-NEXT: shldq %cl, %r8, %r13 -; AVX512-NEXT: movq 32(%rsp,%rbx), %r9 -; AVX512-NEXT: movq 40(%rsp,%rbx), %r14 -; AVX512-NEXT: movq %r14, %r10 -; AVX512-NEXT: shldq %cl, %r9, %r10 -; AVX512-NEXT: movq 8(%rsp,%rbx), %r11 -; AVX512-NEXT: shldq %cl, %r11, %rsi -; AVX512-NEXT: shldq %cl, %r14, %r8 -; AVX512-NEXT: movq 16(%rdi), %r12 -; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 48(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r8, %r14 -; AVX512-NEXT: andq %rsi, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq 56(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r13, %r15 -; AVX512-NEXT: movq 24(%rdi), %r14 -; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %rax, %r14 -; AVX512-NEXT: orq %r15, %r14 -; AVX512-NEXT: shldq %cl, %rbp, %r9 -; AVX512-NEXT: movq (%rsp,%rbx), %rdx -; AVX512-NEXT: movq 32(%rdi), %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r9, %r15 -; AVX512-NEXT: shlxq %rcx, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq (%rdi), %rbx -; AVX512-NEXT: movq %rbx, %rbp -; AVX512-NEXT: andq %rax, %rbp -; AVX512-NEXT: orq %r15, %rbp -; AVX512-NEXT: orq %r12, %rbp -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rdx, %r11 -; AVX512-NEXT: movq 40(%rdi), %rax -; AVX512-NEXT: movq %rax, %rcx -; AVX512-NEXT: andq %r10, %rcx -; AVX512-NEXT: movq 8(%rdi), %r15 -; AVX512-NEXT: movq %r15, %r12 -; AVX512-NEXT: andq %r11, %r12 -; AVX512-NEXT: orq %rcx, %r12 -; AVX512-NEXT: orq %r14, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT: orq %rax, %r10 -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX512-NEXT: orq %r15, %r11 -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512-NEXT: orq %rbp, %r12 -; AVX512-NEXT: movq %r8, 48(%rdi) -; AVX512-NEXT: movq %r13, 56(%rdi) -; AVX512-NEXT: movq %r9, 32(%rdi) -; AVX512-NEXT: movq %r10, 40(%rdi) -; AVX512-NEXT: movq %rsi, 16(%rdi) -; AVX512-NEXT: movq %rcx, 24(%rdi) -; AVX512-NEXT: movq %rbx, (%rdi) -; AVX512-NEXT: movq %r11, 8(%rdi) -; AVX512-NEXT: setne %al -; AVX512-NEXT: addq $72, %rsp -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; 
AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: init_eq_i512: +; AVX: # %bb.0: +; AVX-NEXT: movl %esi, %ecx +; AVX-NEXT: shrl $3, %ecx +; AVX-NEXT: andl $60, %ecx +; AVX-NEXT: movl (%rdi,%rcx), %r8d +; AVX-NEXT: btl %esi, %r8d +; AVX-NEXT: setae %al +; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: shlxl %esi, %edx, %edx +; AVX-NEXT: orl %r8d, %edx +; AVX-NEXT: movl %edx, (%rdi,%rcx) +; AVX-NEXT: retq %rem = and i32 %position, 511 %ofs = zext nneg i32 %rem to i512 %bit = shl nuw i512 1, %ofs - %ld = load i512, ptr %word - %test = and i512 %ld, %bit - %res = or i512 %ld, %bit - %cmp = icmp ne i512 %test, 0 - store i512 %res, ptr %word - ret i1 %cmp -} - -define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { -; X86-LABEL: init_eq_i512: -; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $432, %esp # imm = 0x1B0 -; X86-NEXT: movl 12(%ebp), %ecx -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shrl $3, %edx -; X86-NEXT: andl $60, %edx -; X86-NEXT: leal {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl %edx, %esi -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 56(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%esi), %eax -; X86-NEXT: movl 48(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%esi), %edi -; X86-NEXT: movl 
%edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%esi), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%esi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movzbl 16(%ebp), %ebx -; X86-NEXT: movzbl %bl, %esi -; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: leal {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: shldl %cl, %edi, %edx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 8(%ebp), %ebx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%ebx), %esi -; X86-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%ebx), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %eax, %edx -; X86-NEXT: orl %esi, %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%ebx), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %ecx -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl (%ebx), %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: movl %eax, %ecx -; X86-NEXT: movl %edx, %eax -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 4(%ebx), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edx, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl 56(%edi), %ebx -; X86-NEXT: movl 60(%edi), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 52(%edi), %eax -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 48(%edi), %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %esi, %eax -; 
X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl 40(%edi), %ebx -; X86-NEXT: movl 44(%edi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 36(%edi), %eax -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 32(%edi), %ebx -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 28(%edi), %eax -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 24(%edi), %ebx -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 20(%edi), %eax -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 16(%edi), %ebx -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl 12(%edi), %eax -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: notl %esi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT: movl 8(%edi), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: notl %eax -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 4(%edi), %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %edx 
-; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %esi, %eax -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: movl (%edi), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %ebx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shll %cl, %eax -; X86-NEXT: orl %eax, %edi -; X86-NEXT: movl %edi, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 60(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 56(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 52(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 44(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 40(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 36(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 32(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 28(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 24(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 20(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 16(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 12(%eax) -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl %edi, 8(%eax) -; X86-NEXT: movl %edx, 4(%eax) -; X86-NEXT: movl %ecx, (%eax) -; X86-NEXT: movl %esi, 48(%eax) -; X86-NEXT: sete %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp -; X86-NEXT: retl -; -; SSE-LABEL: init_eq_i512: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r13 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: subq $216, %rsp -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) -; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: 
shrl $3, %esi -; SSE-NEXT: andl $56, %esi -; SSE-NEXT: negl %esi -; SSE-NEXT: movslq %esi, %r10 -; SSE-NEXT: movq 184(%rsp,%r10), %r11 -; SSE-NEXT: movq 192(%rsp,%r10), %rsi -; SSE-NEXT: movq %rsi, %r13 -; SSE-NEXT: shldq %cl, %r11, %r13 -; SSE-NEXT: movq 200(%rsp,%r10), %r15 -; SSE-NEXT: shldq %cl, %rsi, %r15 -; SSE-NEXT: movq 168(%rsp,%r10), %rbx -; SSE-NEXT: movq 176(%rsp,%r10), %rsi -; SSE-NEXT: movq %rsi, %r14 -; SSE-NEXT: shldq %cl, %rbx, %r14 -; SSE-NEXT: shldq %cl, %rsi, %r11 -; SSE-NEXT: movq 152(%rsp,%r10), %rax -; SSE-NEXT: movq 160(%rsp,%r10), %r8 -; SSE-NEXT: movq %r8, %r12 -; SSE-NEXT: shldq %cl, %rax, %r12 -; SSE-NEXT: shldq %cl, %r8, %rbx -; SSE-NEXT: movq 144(%rsp,%r10), %r9 -; SSE-NEXT: movq %r9, %r8 -; SSE-NEXT: shlq %cl, %r8 -; SSE-NEXT: shldq %cl, %r9, %rax -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movl %edx, %edx -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, (%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq 16(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 48(%rdi), %rsi -; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rsi, %r13 -; SSE-NEXT: andq %rdx, %r12 -; SSE-NEXT: orq %r13, %r12 -; SSE-NEXT: movq %r15, %rsi -; SSE-NEXT: movq 56(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %r15 -; SSE-NEXT: movq %rbx, %r13 -; SSE-NEXT: movq 24(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %rbx -; SSE-NEXT: orq %r15, %rbx -; SSE-NEXT: movq %r14, %rbp -; SSE-NEXT: movq 32(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %r14 -; SSE-NEXT: movq %r8, %r15 -; SSE-NEXT: movq (%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %r8 -; SSE-NEXT: orq %r14, %r8 -; SSE-NEXT: orq %r12, %r8 -; SSE-NEXT: movq %r11, %r12 -; SSE-NEXT: movq 40(%rdi), %r9 -; SSE-NEXT: andq %r9, %r11 -; SSE-NEXT: movq %rax, %r14 -; SSE-NEXT: movq 8(%rdi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq %rdx, %rax -; SSE-NEXT: orq %r11, %rax -; SSE-NEXT: orq %rbx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: notq %rax -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: movq 56(%rsp,%r10), %r11 -; SSE-NEXT: movq 64(%rsp,%r10), %rax -; SSE-NEXT: movq %rax, %rbx -; SSE-NEXT: shldq %cl, %r11, %rbx -; SSE-NEXT: orq %rbx, %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: notq %rsi -; SSE-NEXT: movq 72(%rsp,%r10), %rbx -; SSE-NEXT: shldq %cl, %rax, %rbx -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT: orq %rbx, %rsi -; SSE-NEXT: notq %rbp -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT: movq 40(%rsp,%r10), %rax -; SSE-NEXT: movq 48(%rsp,%r10), %rdx -; SSE-NEXT: movq %rdx, %rbx -; SSE-NEXT: shldq %cl, %rax, %rbx -; SSE-NEXT: orq %rbx, %rbp -; 
SSE-NEXT: notq %r12 -; SSE-NEXT: andq %r9, %r12 -; SSE-NEXT: shldq %cl, %rdx, %r11 -; SSE-NEXT: movq 24(%rsp,%r10), %r9 -; SSE-NEXT: movq 32(%rsp,%r10), %rdx -; SSE-NEXT: movq %rdx, %rbx -; SSE-NEXT: shldq %cl, %r9, %rbx -; SSE-NEXT: orq %r11, %r12 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT: notq %r11 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: orq %rbx, %r11 -; SSE-NEXT: notq %r13 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; SSE-NEXT: orq %rax, %r13 -; SSE-NEXT: notq %r15 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT: movq 16(%rsp,%r10), %rax -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: shlq %cl, %rdx -; SSE-NEXT: orq %rdx, %r15 -; SSE-NEXT: notq %r14 -; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shldq %cl, %rax, %r9 -; SSE-NEXT: orq %r9, %r14 -; SSE-NEXT: orq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: movq %rax, 48(%rdi) -; SSE-NEXT: movq %rsi, 56(%rdi) -; SSE-NEXT: movq %rbp, 32(%rdi) -; SSE-NEXT: movq %r12, 40(%rdi) -; SSE-NEXT: movq %r11, 16(%rdi) -; SSE-NEXT: movq %r13, 24(%rdi) -; SSE-NEXT: movq %r15, (%rdi) -; SSE-NEXT: movq %r14, 8(%rdi) -; SSE-NEXT: sete %al -; SSE-NEXT: addq $216, %rsp -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq -; -; AVX2-LABEL: init_eq_i512: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $200, %rsp -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movl %esi, %r8d -; AVX2-NEXT: andl $63, %r8d -; AVX2-NEXT: shrl $3, %esi -; AVX2-NEXT: andl $56, %esi -; AVX2-NEXT: negl %esi -; AVX2-NEXT: movslq %esi, %rsi -; AVX2-NEXT: movq 144(%rsp,%rsi), %r11 -; AVX2-NEXT: movq 152(%rsp,%rsi), %r12 -; AVX2-NEXT: movq %r12, %r10 -; AVX2-NEXT: movl %r8d, %ecx -; AVX2-NEXT: shldq %cl, %r11, %r10 -; AVX2-NEXT: movq 176(%rsp,%rsi), %r14 -; AVX2-NEXT: movq 184(%rsp,%rsi), %r9 -; AVX2-NEXT: shldq %cl, %r14, %r9 -; AVX2-NEXT: movq 160(%rsp,%rsi), %r15 -; AVX2-NEXT: movq 168(%rsp,%rsi), %r13 -; AVX2-NEXT: movq %r13, %rbx -; AVX2-NEXT: shldq %cl, %r15, %rbx -; AVX2-NEXT: movq 128(%rsp,%rsi), %rbp -; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 136(%rsp,%rsi), %rax -; AVX2-NEXT: shldq %cl, %rax, %r11 -; AVX2-NEXT: shldq %cl, %r13, %r14 -; AVX2-NEXT: shldq %cl, %r12, %r15 -; AVX2-NEXT: shldq %cl, %rbp, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl %edx, %edx -; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT: movq %rdx, (%rsp) -; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: movq 16(%rdi), %r12 -; AVX2-NEXT: movq 48(%rdi), %rbp -; AVX2-NEXT: movq 32(%rdi), %r13 -; AVX2-NEXT: andnq %r13, %r15, %rax 
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r15, %r13 -; AVX2-NEXT: andnq %rbp, %r14, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r14, %rbp -; AVX2-NEXT: andnq %r12, %r11, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r11, %r12 -; AVX2-NEXT: movq 40(%rdi), %rax -; AVX2-NEXT: orq %rbp, %r12 -; AVX2-NEXT: andnq %rax, %rbx, %rcx -; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq %rax, %rbp -; AVX2-NEXT: andq %rbx, %rbp -; AVX2-NEXT: movq 56(%rdi), %rcx -; AVX2-NEXT: andnq %rcx, %r9, %rbx -; AVX2-NEXT: andq %r9, %rcx -; AVX2-NEXT: movq 24(%rdi), %rax -; AVX2-NEXT: andnq %rax, %r10, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq %r10, %rax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: shlxq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT: movq (%rdi), %r10 -; AVX2-NEXT: andnq %r10, %rcx, %r15 -; AVX2-NEXT: andq %rcx, %r10 -; AVX2-NEXT: movq 40(%rsp,%rsi), %rdx -; AVX2-NEXT: movq 48(%rsp,%rsi), %r11 -; AVX2-NEXT: movq %r11, %r9 -; AVX2-NEXT: movl %r8d, %ecx -; AVX2-NEXT: shldq %cl, %rdx, %r9 -; AVX2-NEXT: orq %r13, %r10 -; AVX2-NEXT: orq %r12, %r10 -; AVX2-NEXT: movq 8(%rdi), %r13 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: andnq %r13, %rcx, %r12 -; AVX2-NEXT: andq %rcx, %r13 -; AVX2-NEXT: orq %rbp, %r13 -; AVX2-NEXT: orq %rax, %r13 -; AVX2-NEXT: movq 56(%rsp,%rsi), %rax -; AVX2-NEXT: movl %r8d, %ecx -; AVX2-NEXT: shldq %cl, %r11, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: orq %r9, %r14 -; AVX2-NEXT: orq %rax, %rbx -; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 24(%rsp,%rsi), %rax -; AVX2-NEXT: movq 32(%rsp,%rsi), %r9 -; AVX2-NEXT: movq %r9, %r11 -; AVX2-NEXT: shldq %cl, %rax, %r11 -; AVX2-NEXT: shldq %cl, %r9, %rdx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX2-NEXT: orq %r11, %rbp -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: orq %rdx, %rbx -; AVX2-NEXT: movq 8(%rsp,%rsi), %rdx -; AVX2-NEXT: movq 16(%rsp,%rsi), %r9 -; AVX2-NEXT: movq %r9, %r11 -; AVX2-NEXT: shldq %cl, %rdx, %r11 -; AVX2-NEXT: shldq %cl, %r9, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: orq %r11, %r9 -; AVX2-NEXT: movq (%rsp,%rsi), %rsi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: orq %rax, %r11 -; AVX2-NEXT: shlxq %r8, %rsi, %rax -; AVX2-NEXT: shldq %cl, %rsi, %rdx -; AVX2-NEXT: orq %rax, %r15 -; AVX2-NEXT: orq %rdx, %r12 -; AVX2-NEXT: orq %r10, %r13 -; AVX2-NEXT: movq %r14, 48(%rdi) -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: movq %rax, 56(%rdi) -; AVX2-NEXT: movq %rbp, 32(%rdi) -; AVX2-NEXT: movq %rbx, 40(%rdi) -; AVX2-NEXT: movq %r9, 16(%rdi) -; AVX2-NEXT: movq %r11, 24(%rdi) -; AVX2-NEXT: movq %r15, (%rdi) -; AVX2-NEXT: movq %r12, 8(%rdi) -; AVX2-NEXT: sete %al -; AVX2-NEXT: addq $200, %rsp -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: init_eq_i512: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $184, %rsp 
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %esi -; AVX512-NEXT: andl $56, %esi -; AVX512-NEXT: negl %esi -; AVX512-NEXT: movslq %esi, %rsi -; AVX512-NEXT: movq 128(%rsp,%rsi), %r10 -; AVX512-NEXT: movq 136(%rsp,%rsi), %r12 -; AVX512-NEXT: movq %r12, %rax -; AVX512-NEXT: shldq %cl, %r10, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 160(%rsp,%rsi), %r14 -; AVX512-NEXT: movq 168(%rsp,%rsi), %rax -; AVX512-NEXT: shldq %cl, %r14, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 144(%rsp,%rsi), %r15 -; AVX512-NEXT: movq 152(%rsp,%rsi), %r11 -; AVX512-NEXT: movq %r11, %rbx -; AVX512-NEXT: shldq %cl, %r15, %rbx -; AVX512-NEXT: movq 120(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rax, %r10 -; AVX512-NEXT: shldq %cl, %r11, %r14 -; AVX512-NEXT: movq %rdi, %r9 -; AVX512-NEXT: movq 112(%rsp,%rsi), %r11 -; AVX512-NEXT: shldq %cl, %r12, %r15 -; AVX512-NEXT: movl %edx, %edx -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) -; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: movq 16(%rdi), %r12 -; AVX512-NEXT: movq 48(%rdi), %r13 -; AVX512-NEXT: movq 32(%rdi), %rbp -; AVX512-NEXT: andnq %rbp, %r15, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r15, %rbp -; AVX512-NEXT: andnq %r13, %r14, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r14, %r13 -; AVX512-NEXT: andnq %r12, %r10, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq %r10, %r12 -; AVX512-NEXT: movq 40(%rdi), %r8 -; AVX512-NEXT: orq %r13, %r12 -; AVX512-NEXT: andnq %r8, %rbx, %rdi -; AVX512-NEXT: andq %rbx, %r8 -; AVX512-NEXT: movq 56(%r9), %r13 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: andnq %r13, %rdx, %r10 -; AVX512-NEXT: andq %rdx, %r13 -; AVX512-NEXT: movq 24(%r9), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: andnq %rax, %rdx, %r15 -; AVX512-NEXT: andq %rdx, %rax -; AVX512-NEXT: orq %r13, %rax -; AVX512-NEXT: shlxq %rcx, %r11, %r13 -; AVX512-NEXT: movq (%r9), %rdx -; AVX512-NEXT: andnq %rdx, %r13, %r14 -; AVX512-NEXT: andq %r13, %rdx -; AVX512-NEXT: orq %rbp, %rdx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r11, %rbp -; AVX512-NEXT: orq %r12, %rdx -; AVX512-NEXT: movq 8(%r9), %r13 -; AVX512-NEXT: andnq %r13, %rbp, %rbx -; AVX512-NEXT: andq %rbp, %r13 -; AVX512-NEXT: orq %r8, %r13 -; AVX512-NEXT: movq 24(%rsp,%rsi), %r8 -; AVX512-NEXT: orq %rax, %r13 -; AVX512-NEXT: movq 32(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, %r12 -; AVX512-NEXT: shldq %cl, %r8, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: orq %r12, %r11 -; AVX512-NEXT: movq 40(%rsp,%rsi), %r12 -; AVX512-NEXT: shldq %cl, %rax, %r12 -; AVX512-NEXT: orq %r12, 
%r10
-; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 8(%rsp,%rsi), %rax
-; AVX512-NEXT: movq 16(%rsp,%rsi), %r12
-; AVX512-NEXT: movq %r12, %rbp
-; AVX512-NEXT: shldq %cl, %rax, %rbp
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: orq %rbp, %r10
-; AVX512-NEXT: shldq %cl, %r12, %r8
-; AVX512-NEXT: orq %r8, %rdi
-; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq -8(%rsp,%rsi), %r8
-; AVX512-NEXT: movq (%rsp,%rsi), %r12
-; AVX512-NEXT: movq %r12, %rbp
-; AVX512-NEXT: shldq %cl, %r8, %rbp
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT: orq %rbp, %rdi
-; AVX512-NEXT: movq -16(%rsp,%rsi), %rsi
-; AVX512-NEXT: shldq %cl, %r12, %rax
-; AVX512-NEXT: orq %rax, %r15
-; AVX512-NEXT: shlxq %rcx, %rsi, %rax
-; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512-NEXT: shldq %cl, %rsi, %r8
-; AVX512-NEXT: orq %rax, %r14
-; AVX512-NEXT: orq %r8, %rbx
-; AVX512-NEXT: orq %rdx, %r13
-; AVX512-NEXT: movq %r11, 48(%r9)
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: movq %rax, 56(%r9)
-; AVX512-NEXT: movq %r10, 32(%r9)
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: movq %rax, 40(%r9)
-; AVX512-NEXT: movq %rdi, 16(%r9)
-; AVX512-NEXT: movq %r15, 24(%r9)
-; AVX512-NEXT: movq %r14, (%r9)
-; AVX512-NEXT: movq %rbx, 8(%r9)
-; AVX512-NEXT: sete %al
-; AVX512-NEXT: addq $184, %rsp
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %rem = and i32 %position, 511
- %ofs = zext nneg i32 %rem to i512
- %bit = shl nuw i512 1, %ofs
- %mask = xor i512 %bit, -1
- %val0 = zext i1 %value to i512
- %val = shl nuw i512 %val0, %ofs
+ %mask = xor i512 %bit, -1
+ %val0 = zext i1 %value to i512
+ %val = shl nuw i512 %val0, %ofs
 %ld = load i512, ptr %word
 %test = and i512 %ld, %bit
 %res0 = and i512 %ld, %mask
 %res = or i512 %res0, %val
- %cmp = icmp eq i512 %test, 0
- store i512 %res, ptr %word
- ret i1 %cmp
-}
-
-; i4096
-
-define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
-; X86-LABEL: test_ne_i4096:
-; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
-; X86-NEXT: movl %esp, %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %esi
-; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $1792, %esp # imm = 0x700
-; X86-NEXT: movl 12(%ebp), %ebx
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: shrl $3, %ecx
-; X86-NEXT: andl $508, %ecx # imm = 0x1FC
-; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: subl %ecx, %esi
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
-; 
X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; 
X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; 
X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl 248(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 252(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl $31, %ebx -; X86-NEXT: movl %ebx, %ecx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 504(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 508(%esi), %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 120(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 124(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 376(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 380(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 184(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 188(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 440(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 444(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 56(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 60(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 312(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 316(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 216(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 220(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 472(%esi), %edi -; X86-NEXT: movl 476(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 88(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 92(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 344(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 348(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 152(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 156(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 408(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 412(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 24(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 28(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 280(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 284(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 232(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 236(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 488(%esi), %edx -; 
X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 492(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 104(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 108(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 360(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 364(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 168(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 172(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 424(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 428(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 40(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 296(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 300(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 200(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 204(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 456(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 460(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 72(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 76(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 328(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 332(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 136(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 140(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 392(%esi), %edx -; X86-NEXT: movl 
%edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 396(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 12(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 264(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 268(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 240(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 244(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 496(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 500(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 112(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 116(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 368(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 372(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 176(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 180(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 432(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 436(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 52(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 304(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 308(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 208(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 212(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 464(%esi), %edx -; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 468(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 80(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 84(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 336(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 340(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 144(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 148(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 400(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 404(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 16(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 20(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 272(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 276(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 224(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 228(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 480(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 484(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 96(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 100(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 352(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 356(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 160(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 164(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 416(%esi), %edx -; X86-NEXT: movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 420(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 32(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 36(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 288(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 292(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 192(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 196(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 448(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 452(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 64(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 68(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 320(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 324(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 128(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 132(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, %edx -; X86-NEXT: movl 256(%esi), %edi -; X86-NEXT: movl 260(%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded 
Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 388(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl 4(%esi), %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 
4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: shrdl $1, %eax, %edi -; X86-NEXT: shrl %eax -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: notb %cl -; X86-NEXT: shrdl %cl, %eax, %edi -; X86-NEXT: shrl %cl, %ebx -; X86-NEXT: movb $32, %cl -; X86-NEXT: testb %cl, %cl -; X86-NEXT: movl (%esi), %eax -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: jne .LBB20_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: .LBB20_2: -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: shll %cl, %edx -; X86-NEXT: orl %ebx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 320(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 64(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 448(%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 192(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 288(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 32(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 416(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 160(%eax), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 352(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 96(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 480(%eax), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 224(%eax), %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: orl %esi, %ecx -; X86-NEXT: orl %edi, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 272(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 16(%eax), %edx -; X86-NEXT: orl 
%ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 400(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 144(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 336(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 80(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 464(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 208(%eax), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 304(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 48(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 432(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 176(%eax), %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 368(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 112(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 496(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: andl 240(%eax), %ebx -; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: orl %esi, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 264(%eax), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 8(%eax), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 392(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 136(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: orl %edx, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 328(%ebx), %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 72(%ebx), %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 456(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 200(%ebx), %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 296(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 40(%ebx), %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 424(%ebx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 168(%ebx), %edx -; X86-NEXT: orl %edi, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 360(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 104(%ebx), %eax -; X86-NEXT: 
orl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 488(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 232(%ebx), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 280(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 24(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 408(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 152(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 344(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 88(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 472(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 216(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 312(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 56(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 440(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 184(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 376(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 120(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 504(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 248(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 324(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 68(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 452(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 196(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 292(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 36(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 420(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 164(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 356(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 100(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 484(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 228(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 276(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 20(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 404(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 148(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 340(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 84(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 468(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 212(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 308(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 52(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 436(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 180(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 372(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 116(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 500(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 244(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %esi, %edi -; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 268(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 12(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 396(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 140(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 332(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 76(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 460(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 204(%ebx), %edi -; X86-NEXT: orl 
%eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 300(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 44(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 428(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 172(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 364(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 108(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 492(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: andl 236(%ebx), %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: orl %edx, %esi -; X86-NEXT: orl %edi, %esi -; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 284(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 28(%ebx), %ecx -; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 412(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 156(%ebx), %edi -; X86-NEXT: orl %eax, %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 348(%ebx), %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 92(%ebx), %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 476(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl 220(%ebx), %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: orl %edx, %eax -; X86-NEXT: orl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 316(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 60(%ebx), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 444(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT: andl 188(%ebx), %edi -; X86-NEXT: orl %ecx, %edi -; X86-NEXT: orl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 380(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: andl 124(%ebx), %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: andl 508(%ebx), %ecx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: andl 252(%esi), %ebx -; X86-NEXT: orl %ecx, %ebx -; X86-NEXT: orl %edx, %ebx -; X86-NEXT: orl %edi, %ebx -; X86-NEXT: orl %eax, %ebx -; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: negl %ecx -; X86-NEXT: movl 1648(%esp,%ecx), %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl %cl, %edi, %esi -; 
X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl %cl, %edx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl 8(%ebp), %edx
-; X86-NEXT: andl 128(%edx), %ecx
-; X86-NEXT: andl 384(%edx), %edi
-; X86-NEXT: orl %ecx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: andl (%edx), %eax
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 256(%edx), %eax
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 260(%edx), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: andl 4(%edx), %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: andl 132(%edx), %eax
-; X86-NEXT: andl 388(%edx), %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: orl %ecx, %esi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: orl %ebx, %esi
-; X86-NEXT: orl %edi, %esi
-; X86-NEXT: setne %al
-; X86-NEXT: leal -12(%ebp), %esp
-; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
+ %cmp = icmp eq i512 %test, 0
+ store i512 %res, ptr %word
+ ret i1 %cmp
+}
+
+; i4096
+
+define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
+; X86-LABEL: test_ne_i4096:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $4064, %edx # imm = 0xFE0
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%eax,%edx), %eax
+; X86-NEXT: btl %ecx, %eax
+; X86-NEXT: setb %al
 ; X86-NEXT: retl
 ;
-; SSE-LABEL: test_ne_i4096:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: subq $1576, %rsp # imm = 0x628
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl %esi, %eax
-; SSE-NEXT: andl $4032, %eax # imm = 0xFC0
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
-; SSE-NEXT: movups
%xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp) -; SSE-NEXT: andl $63, %ecx -; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: negl %eax -; SSE-NEXT: movslq %eax, %rsi -; SSE-NEXT: movq 1296(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1304(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1552(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1560(%rsp,%rsi), %rax -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1168(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1176(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1424(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1432(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1232(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1240(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1488(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1496(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1104(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1112(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1360(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; SSE-NEXT: movq 1368(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1264(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1272(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1520(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1528(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1136(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1144(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1392(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1400(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1200(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1208(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1456(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1464(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1072(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1080(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1328(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1336(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1280(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1288(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1536(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1544(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1152(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1160(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1408(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1416(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1216(%rsp,%rsi), %r11 -; SSE-NEXT: movq 1224(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %r11, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1472(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1480(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1088(%rsp,%rsi), %r9 -; SSE-NEXT: movq 1096(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %r9, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1344(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1352(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1248(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1256(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1504(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1512(%rsp,%rsi), %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rdx, %rax -; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1120(%rsp,%rsi), %rax -; SSE-NEXT: movq 1128(%rsp,%rsi), %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: shldq %cl, %rax, %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1376(%rsp,%rsi), %r13 -; SSE-NEXT: movq 1384(%rsp,%rsi), %rbx -; SSE-NEXT: movq %rbx, %r8 -; SSE-NEXT: shldq %cl, %r13, %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1184(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1192(%rsp,%rsi), %r15 -; SSE-NEXT: movq %r15, %r14 -; SSE-NEXT: shldq %cl, %rdx, %r14 -; SSE-NEXT: movq %r14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1440(%rsp,%rsi), %r10 -; SSE-NEXT: movq 1448(%rsp,%rsi), %rdx -; SSE-NEXT: movq %rdx, %r14 -; SSE-NEXT: shldq %cl, %r10, %r14 -; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1312(%rsp,%rsi), %r14 -; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq 1320(%rsp,%rsi), %rbp -; SSE-NEXT: movq %rbp, %r12 -; SSE-NEXT: shldq %cl, %r14, %r12 -; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, (%rsp) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq 1064(%rsp,%rsi), %rbx -; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE-NEXT: shldq %cl, %rbp, %r14 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rdx, %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %r9 -; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; 
SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %rbp -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r15, %r13 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r12, %r15 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT: shldq %cl, %r12, %r10 -; SSE-NEXT: andq 384(%rdi), %r10 -; SSE-NEXT: andq 128(%rdi), %r15 -; SSE-NEXT: andq 320(%rdi), %r13 -; SSE-NEXT: andq 64(%rdi), %rax -; SSE-NEXT: orq %r10, %r15 -; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: orq %r13, %rax -; SSE-NEXT: andq 448(%rdi), %r9 -; SSE-NEXT: andq 192(%rdi), %rbp -; SSE-NEXT: orq %r9, %rbp -; SSE-NEXT: orq %rax, %rbp -; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: andq 288(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 32(%rdi), %r9 -; SSE-NEXT: andq 416(%rdi), %rdx -; SSE-NEXT: andq 160(%rdi), %r11 -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: orq %rdx, %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 352(%rdi), %rdx -; SSE-NEXT: orq %r9, %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 96(%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 480(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 224(%rdi), %r8 -; SSE-NEXT: orq %rax, %r8 -; SSE-NEXT: orq %rdx, %r8 -; SSE-NEXT: andq 272(%rdi), %r14 -; SSE-NEXT: orq %r11, %r8 -; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 16(%rdi), %rax -; SSE-NEXT: orq %r14, %rax -; SSE-NEXT: movq %rax, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 400(%rdi), %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 144(%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: orq %r8, %rax -; SSE-NEXT: movq %rax, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 336(%rdi), %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 80(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 464(%rdi), %rdx -; SSE-NEXT: orq %r9, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT: andq 208(%rdi), %r11 -; SSE-NEXT: orq %rdx, %r11 -; SSE-NEXT: orq %rax, %r11 -; SSE-NEXT: orq %r8, %r11 -; SSE-NEXT: movq (%rsp), %rdx # 8-byte Reload -; SSE-NEXT: andq 304(%rdi), %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 48(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 432(%rdi), %r9 -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 
# 8-byte Reload -; SSE-NEXT: andq 176(%rdi), %r8 -; SSE-NEXT: orq %r9, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 368(%rdi), %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 112(%rdi), %rax -; SSE-NEXT: orq %r10, %r8 -; SSE-NEXT: movq %r8, %r10 -; SSE-NEXT: orq %r9, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 496(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE-NEXT: andq 240(%rdi), %rbp -; SSE-NEXT: orq %r8, %rbp -; SSE-NEXT: orq %rax, %rbp -; SSE-NEXT: orq %r10, %rbp -; SSE-NEXT: orq %r11, %rbp -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 392(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT: andq 136(%rdi), %r12 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 328(%rdi), %rdx -; SSE-NEXT: orq %rax, %r12 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 72(%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 456(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; SSE-NEXT: andq 200(%rdi), %r13 -; SSE-NEXT: orq %rax, %r13 -; SSE-NEXT: orq %rdx, %r13 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 296(%rdi), %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 40(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 424(%rdi), %r8 -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: movq %rax, %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: andq 168(%rdi), %rdx -; SSE-NEXT: orq %r8, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 360(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 104(%rdi), %rax -; SSE-NEXT: orq %r9, %rdx -; SSE-NEXT: orq %r8, %rax -; SSE-NEXT: movq %rax, %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 488(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT: andq 232(%rdi), %r15 -; SSE-NEXT: orq %rax, %r15 -; SSE-NEXT: orq %r8, %r15 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 280(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 24(%rdi), %rax -; SSE-NEXT: orq %rdx, %r15 -; SSE-NEXT: orq %r8, %rax -; SSE-NEXT: movq %rax, %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 408(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 152(%rdi), %rax -; SSE-NEXT: orq %r8, %rax -; SSE-NEXT: orq %r10, %rax -; SSE-NEXT: movq %rax, %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT: andq 344(%rdi), %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 88(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 472(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE-NEXT: andq 216(%rdi), %r14 -; SSE-NEXT: orq %r11, %r8 -; SSE-NEXT: orq %rax, %r14 -; SSE-NEXT: orq %r8, %r14 -; SSE-NEXT: orq %r10, %r14 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT: andq 
312(%rdi), %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE-NEXT: andq 56(%rdi), %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 440(%rdi), %r8 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT: andq 184(%rdi), %r9 -; SSE-NEXT: orq %r11, %r10 -; SSE-NEXT: orq %r8, %r9 -; SSE-NEXT: orq %r10, %r9 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE-NEXT: andq 376(%rdi), %r10 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT: andq 120(%rdi), %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT: andq 504(%rdi), %r11 -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT: andq 248(%rdi), %r8 -; SSE-NEXT: orq %r10, %rax -; SSE-NEXT: movq %rax, %r10 -; SSE-NEXT: orq %r11, %r8 -; SSE-NEXT: movq 1056(%rsp,%rsi), %rax -; SSE-NEXT: shldq %cl, %rax, %rbx -; SSE-NEXT: # kill: def $cl killed $cl killed $ecx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: orq %r10, %r8 -; SSE-NEXT: orq %r9, %r8 -; SSE-NEXT: andq 256(%rdi), %rdx -; SSE-NEXT: orq %r14, %r8 -; SSE-NEXT: andq (%rdi), %rax -; SSE-NEXT: orq %rdx, %rax -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT: orq %rbp, %rax -; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE-NEXT: andq 264(%rdi), %rcx -; SSE-NEXT: andq 8(%rdi), %rbx -; SSE-NEXT: orq %rcx, %rbx -; SSE-NEXT: orq %r12, %rbx -; SSE-NEXT: orq %r13, %rbx -; SSE-NEXT: orq %r15, %rbx -; SSE-NEXT: orq %r8, %rbx -; SSE-NEXT: orq %rax, %rbx -; SSE-NEXT: setne %al -; SSE-NEXT: addq $1576, %rsp # imm = 0x628 -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r13 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq -; -; AVX2-LABEL: test_ne_i4096: -; AVX2: # %bb.0: -; AVX2-NEXT: pushq %rbp -; AVX2-NEXT: pushq %r15 -; AVX2-NEXT: pushq %r14 -; AVX2-NEXT: pushq %r13 -; AVX2-NEXT: pushq %r12 -; AVX2-NEXT: pushq %rbx -; AVX2-NEXT: subq $1560, %rsp # imm = 0x618 -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl %esi, %eax -; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups 
%ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: andl $4032, %eax # imm = 0xFC0 -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT: andl $63, %ecx -; AVX2-NEXT: shrl $3, %eax -; AVX2-NEXT: negl %eax -; AVX2-NEXT: movslq %eax, %rsi -; AVX2-NEXT: movq 1280(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1288(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1536(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1544(%rsp,%rsi), %rax -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1152(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1160(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1408(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1416(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1216(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; AVX2-NEXT: movq 1224(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1472(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1480(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1088(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1096(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1344(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1352(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1248(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1256(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1504(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1512(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; 
AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1120(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1128(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1376(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1384(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1184(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1192(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1440(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1448(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1056(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1064(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1312(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1320(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1264(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1272(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1520(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1528(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1136(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1144(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1392(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1400(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1200(%rsp,%rsi), %r11 -; AVX2-NEXT: movq 1208(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %r11, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1456(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1464(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1072(%rsp,%rsi), %r12 -; AVX2-NEXT: movq 1080(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %r12, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1328(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1336(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rdx, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1232(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1240(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1488(%rsp,%rsi), %rbp -; AVX2-NEXT: movq 1496(%rsp,%rsi), %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rbp, %rax -; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1104(%rsp,%rsi), %rax -; AVX2-NEXT: movq 1112(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1360(%rsp,%rsi), %r10 -; AVX2-NEXT: movq 1368(%rsp,%rsi), %r8 -; AVX2-NEXT: movq %r8, %rdx -; AVX2-NEXT: shldq %cl, %r10, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1168(%rsp,%rsi), %r9 -; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1176(%rsp,%rsi), %rbx -; AVX2-NEXT: movq %rbx, %rdx -; AVX2-NEXT: shldq %cl, %r9, %rdx -; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1424(%rsp,%rsi), %r9 -; AVX2-NEXT: movq 1432(%rsp,%rsi), %rdx -; AVX2-NEXT: movq %rdx, %r14 -; AVX2-NEXT: shldq %cl, %r9, %r14 -; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1296(%rsp,%rsi), %r15 -; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq 1304(%rsp,%rsi), %r14 -; AVX2-NEXT: movq %r14, %r13 -; AVX2-NEXT: shldq %cl, %r15, %r13 -; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, (%rsp) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq 1048(%rsp,%rsi), %rdx -; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %rbx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r13 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %rbp -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r14, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r15, %r9 -; AVX2-NEXT: andq 384(%rdi), %r9 -; AVX2-NEXT: andq 128(%rdi), %r14 -; AVX2-NEXT: andq 320(%rdi), %r10 -; AVX2-NEXT: orq %r9, %r14 -; AVX2-NEXT: movq %r14, %r15 -; AVX2-NEXT: andq 64(%rdi), %rax -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: andq 448(%rdi), %rbp -; AVX2-NEXT: andq 192(%rdi), %r13 -; AVX2-NEXT: orq %rbp, %r13 -; AVX2-NEXT: orq %rax, %r13 -; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: andq 288(%rdi), %r8 -; AVX2-NEXT: andq 32(%rdi), %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 416(%rdi), %rax -; AVX2-NEXT: orq %r8, %r12 -; AVX2-NEXT: andq 160(%rdi), 
%r11 -; AVX2-NEXT: orq %rax, %r11 -; AVX2-NEXT: andq 352(%rdi), %rbx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 96(%rdi), %rax -; AVX2-NEXT: orq %r12, %r11 -; AVX2-NEXT: orq %rbx, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 480(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT: andq 224(%rdi), %r13 -; AVX2-NEXT: orq %r10, %r13 -; AVX2-NEXT: orq %rax, %r13 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 272(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 16(%rdi), %rax -; AVX2-NEXT: orq %r11, %r13 -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: andq 400(%rdi), %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 144(%rdi), %rax -; AVX2-NEXT: orq %r9, %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 336(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 80(%rdi), %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 464(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: andq 208(%rdi), %r11 -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: orq %r8, %r11 -; AVX2-NEXT: orq %rax, %r11 -; AVX2-NEXT: orq %r9, %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: andq 304(%rdi), %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 48(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 432(%rdi), %r10 -; AVX2-NEXT: movq (%rsp), %rax # 8-byte Reload -; AVX2-NEXT: andq 176(%rdi), %rax -; AVX2-NEXT: orq %r9, %r8 -; AVX2-NEXT: movq %r8, %r9 -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 368(%rdi), %r8 -; AVX2-NEXT: orq %r9, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 112(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 496(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT: andq 240(%rdi), %r9 -; AVX2-NEXT: orq %r8, %r9 -; AVX2-NEXT: orq %rax, %r9 -; AVX2-NEXT: orq %r10, %r9 -; AVX2-NEXT: orq %r11, %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 392(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX2-NEXT: andq 136(%rdi), %rbp -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 328(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 72(%rdi), %rax -; AVX2-NEXT: orq %r10, %rbp -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 456(%rdi), %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; AVX2-NEXT: andq 200(%rdi), %r12 -; AVX2-NEXT: orq %rax, %r12 -; AVX2-NEXT: orq %r8, %r12 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 296(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; 
AVX2-NEXT: andq 40(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: andq 424(%rdi), %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 168(%rdi), %rax -; AVX2-NEXT: orq %r10, %r8 -; AVX2-NEXT: movq %r8, %r10 -; AVX2-NEXT: orq %r11, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 360(%rdi), %r8 -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 104(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 488(%rdi), %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: andq 232(%rdi), %r14 -; AVX2-NEXT: orq %rax, %r14 -; AVX2-NEXT: orq %r8, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 280(%rdi), %r8 -; AVX2-NEXT: orq %r10, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 24(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 408(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 152(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: andq 344(%rdi), %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 88(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 472(%rdi), %rax -; AVX2-NEXT: orq %r11, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: andq 216(%rdi), %rbx -; AVX2-NEXT: orq %rax, %rbx -; AVX2-NEXT: orq %r8, %rbx -; AVX2-NEXT: orq %r10, %rbx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 312(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 56(%rdi), %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 440(%rdi), %r10 -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: movq %rax, %r11 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 184(%rdi), %r8 -; AVX2-NEXT: orq %r10, %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: andq 376(%rdi), %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 120(%rdi), %rax -; AVX2-NEXT: orq %r11, %r8 -; AVX2-NEXT: movq %r8, %r11 -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: andq 504(%rdi), %r8 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT: andq 248(%rdi), %rax -; AVX2-NEXT: orq %r8, %rax -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: shldq %cl, %r8, %r10 -; AVX2-NEXT: orq %r11, %rax -; AVX2-NEXT: movq 1040(%rsp,%rsi), %rsi -; AVX2-NEXT: orq %rbx, %rax -; AVX2-NEXT: movq %rax, %r8 -; AVX2-NEXT: shlxq %rcx, %rsi, %rax -; AVX2-NEXT: andq 256(%rdi), %r10 -; AVX2-NEXT: andq (%rdi), %rax -; AVX2-NEXT: orq %r10, %rax -; AVX2-NEXT: orq %r15, %rax -; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte 
Folded Reload -; AVX2-NEXT: orq %r13, %rax -; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT: shldq %cl, %rsi, %rdx -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: andq 264(%rdi), %rcx -; AVX2-NEXT: andq 8(%rdi), %rdx -; AVX2-NEXT: orq %r9, %rax -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: orq %rbp, %rdx -; AVX2-NEXT: orq %r12, %rdx -; AVX2-NEXT: orq %r14, %rdx -; AVX2-NEXT: orq %r8, %rdx -; AVX2-NEXT: orq %rax, %rdx -; AVX2-NEXT: setne %al -; AVX2-NEXT: addq $1560, %rsp # imm = 0x618 -; AVX2-NEXT: popq %rbx -; AVX2-NEXT: popq %r12 -; AVX2-NEXT: popq %r13 -; AVX2-NEXT: popq %r14 -; AVX2-NEXT: popq %r15 -; AVX2-NEXT: popq %rbp -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_ne_i4096: -; AVX512: # %bb.0: -; AVX512-NEXT: pushq %rbp -; AVX512-NEXT: pushq %r15 -; AVX512-NEXT: pushq %r14 -; AVX512-NEXT: pushq %r13 -; AVX512-NEXT: pushq %r12 -; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $1560, %rsp # imm = 0x618 -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: andl $4032, %eax # imm = 0xFC0 -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT: andl $63, %ecx -; AVX512-NEXT: shrl $3, %eax -; AVX512-NEXT: negl %eax -; AVX512-NEXT: movslq %eax, %rsi -; AVX512-NEXT: movq 1280(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1288(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1536(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1544(%rsp,%rsi), %rax -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1152(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; 
AVX512-NEXT: movq 1160(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1408(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1416(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1216(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, (%rsp) # 8-byte Spill -; AVX512-NEXT: movq 1224(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1472(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1480(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1088(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1096(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1344(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1352(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1248(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1256(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1504(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1512(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1120(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1128(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1376(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1384(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1184(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1192(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1440(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 
-; AVX512-NEXT: movq 1448(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1056(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1064(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1312(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1320(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1264(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1272(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1520(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1528(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1136(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1144(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1392(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1400(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1200(%rsp,%rsi), %r10 -; AVX512-NEXT: movq 1208(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %r10, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1456(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1464(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1072(%rsp,%rsi), %r14 -; AVX512-NEXT: movq 1080(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %r14, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1328(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1336(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1232(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1240(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; 
AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1488(%rsp,%rsi), %r12 -; AVX512-NEXT: movq 1496(%rsp,%rsi), %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %r12, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1104(%rsp,%rsi), %rax -; AVX512-NEXT: movq 1112(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1360(%rsp,%rsi), %r11 -; AVX512-NEXT: movq 1368(%rsp,%rsi), %rbx -; AVX512-NEXT: movq %rbx, %rdx -; AVX512-NEXT: shldq %cl, %r11, %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1168(%rsp,%rsi), %r9 -; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1176(%rsp,%rsi), %r8 -; AVX512-NEXT: movq %r8, %rdx -; AVX512-NEXT: shldq %cl, %r9, %rdx -; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1424(%rsp,%rsi), %r9 -; AVX512-NEXT: movq 1432(%rsp,%rsi), %rdx -; AVX512-NEXT: movq %rdx, %r15 -; AVX512-NEXT: shldq %cl, %r9, %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1296(%rsp,%rsi), %rbp -; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq 1304(%rsp,%rsi), %r15 -; AVX512-NEXT: movq %r15, %r13 -; AVX512-NEXT: shldq %cl, %rbp, %r13 -; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, (%rsp) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq 1048(%rsp,%rsi), %rdx -; AVX512-NEXT: shldq 
%cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %rbx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r14 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r13 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %r15, %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbp, %r15 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rbp, %r9 -; AVX512-NEXT: andq 384(%rdi), %r9 -; AVX512-NEXT: andq 128(%rdi), %r15 -; AVX512-NEXT: orq %r9, %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: andq 320(%rdi), %r11 -; AVX512-NEXT: andq 64(%rdi), %rax -; AVX512-NEXT: orq %r11, %rax -; AVX512-NEXT: andq 448(%rdi), %r12 -; AVX512-NEXT: andq 192(%rdi), %r13 -; AVX512-NEXT: orq %r12, %r13 -; AVX512-NEXT: orq %rax, %r13 -; AVX512-NEXT: andq 288(%rdi), %r8 -; AVX512-NEXT: andq 32(%rdi), %r14 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 416(%rdi), %rax -; AVX512-NEXT: orq %r8, %r14 -; AVX512-NEXT: andq 160(%rdi), %r10 -; AVX512-NEXT: orq %rax, %r10 -; AVX512-NEXT: andq 352(%rdi), %rbx -; AVX512-NEXT: orq %r14, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 96(%rdi), %rax -; AVX512-NEXT: orq %rbx, %rax -; AVX512-NEXT: movq %rax, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 480(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: andq 224(%rdi), %r15 -; AVX512-NEXT: orq %rax, %r15 -; AVX512-NEXT: orq %r8, %r15 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 272(%rdi), %r8 -; AVX512-NEXT: orq %r10, %r15 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 16(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r8 -; 
AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: andq 400(%rdi), %r9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 144(%rdi), %rax -; AVX512-NEXT: orq %r9, %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: andq 336(%rdi), %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 80(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 464(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: andq 208(%rdi), %r11 -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: orq %r8, %r11 -; AVX512-NEXT: orq %rax, %r11 -; AVX512-NEXT: orq %r9, %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: andq 304(%rdi), %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 48(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: andq 432(%rdi), %r9 -; AVX512-NEXT: movq (%rsp), %r8 # 8-byte Reload -; AVX512-NEXT: andq 176(%rdi), %r8 -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: orq %r9, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: andq 368(%rdi), %r9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 112(%rdi), %rax -; AVX512-NEXT: orq %r10, %r8 -; AVX512-NEXT: movq %r8, %r10 -; AVX512-NEXT: orq %r9, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 496(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT: andq 240(%rdi), %r9 -; AVX512-NEXT: orq %r8, %r9 -; AVX512-NEXT: orq %rax, %r9 -; AVX512-NEXT: orq %r10, %r9 -; AVX512-NEXT: orq %r11, %r9 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: andq 392(%rdi), %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT: andq 136(%rdi), %rbp -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 328(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 72(%rdi), %rax -; AVX512-NEXT: orq %r10, %rbp -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 456(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; AVX512-NEXT: andq 200(%rdi), %r12 -; AVX512-NEXT: orq %rax, %r12 -; AVX512-NEXT: orq %r8, %r12 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 296(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 40(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 424(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 168(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 360(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 104(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: 
movq %rax, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 488(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX512-NEXT: andq 232(%rdi), %r14 -; AVX512-NEXT: orq %rax, %r14 -; AVX512-NEXT: orq %r8, %r14 -; AVX512-NEXT: orq %r10, %r14 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 280(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 24(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 408(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 152(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: andq 344(%rdi), %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 88(%rdi), %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 472(%rdi), %rax -; AVX512-NEXT: orq %r11, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: andq 216(%rdi), %rbx -; AVX512-NEXT: orq %rax, %rbx -; AVX512-NEXT: orq %r8, %rbx -; AVX512-NEXT: orq %r10, %rbx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: andq 312(%rdi), %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 56(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 440(%rdi), %r8 -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 184(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 376(%rdi), %r8 -; AVX512-NEXT: orq %r10, %rax -; AVX512-NEXT: movq %rax, %r11 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 120(%rdi), %rax -; AVX512-NEXT: orq %r8, %rax -; AVX512-NEXT: movq %rax, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 504(%rdi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: andq 248(%rdi), %r8 -; AVX512-NEXT: orq %rax, %r8 -; AVX512-NEXT: orq %r10, %r8 -; AVX512-NEXT: orq %r11, %r8 -; AVX512-NEXT: movq 1040(%rsp,%rsi), %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: shldq %cl, %rsi, %r10 -; AVX512-NEXT: orq %rbx, %r8 -; AVX512-NEXT: shlxq %rcx, %rax, %rsi -; AVX512-NEXT: andq 256(%rdi), %r10 -; AVX512-NEXT: andq (%rdi), %rsi -; AVX512-NEXT: orq %r10, %rsi -; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT: orq %r13, %rsi -; AVX512-NEXT: orq %r15, %rsi -; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT: shldq %cl, %rax, %rdx -; AVX512-NEXT: orq %r9, %rsi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: andq 264(%rdi), %rax -; AVX512-NEXT: andq 8(%rdi), %rdx -; AVX512-NEXT: orq %rax, %rdx -; AVX512-NEXT: orq %rbp, %rdx -; AVX512-NEXT: orq %r12, %rdx -; AVX512-NEXT: orq %r14, %rdx -; AVX512-NEXT: orq %r8, %rdx -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: 
setne %al -; AVX512-NEXT: addq $1560, %rsp # imm = 0x618 -; AVX512-NEXT: popq %rbx -; AVX512-NEXT: popq %r12 -; AVX512-NEXT: popq %r13 -; AVX512-NEXT: popq %r14 -; AVX512-NEXT: popq %r15 -; AVX512-NEXT: popq %rbp -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; X64-LABEL: test_ne_i4096: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl $4064, %eax # imm = 0xFE0 +; X64-NEXT: shrl $3, %eax +; X64-NEXT: movl (%rdi,%rax), %eax +; X64-NEXT: btl %esi, %eax +; X64-NEXT: setb %al +; X64-NEXT: retq %rem = and i32 %position, 4095 %ofs = zext nneg i32 %rem to i4096 %bit = shl nuw i4096 1, %ofs @@ -7032,115 +906,46 @@ define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind { define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_cmpz_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebp -; X86-NEXT: movl %esp, %ebp -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi -; X86-NEXT: pushl %esi -; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $64, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $1, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl $0, {{[0-9]+}}(%esp) -; X86-NEXT: movl %ecx, %eax -; X86-NEXT: shrb $3, %al -; X86-NEXT: andb $12, %al -; X86-NEXT: negb %al -; X86-NEXT: movsbl %al, %esi -; X86-NEXT: movl 36(%esp,%esi), %eax -; X86-NEXT: movl 40(%esp,%esi), %edi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: movl 32(%esp,%esi), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 44(%esp,%esi), %esi -; X86-NEXT: shldl %cl, %edi, %esi -; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shll %cl, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: movl 8(%ebp), %ecx -; X86-NEXT: xorl 12(%ecx), %esi -; X86-NEXT: xorl 8(%ecx), %edx -; X86-NEXT: xorl 4(%ecx), %eax -; X86-NEXT: xorl (%ecx), %edi -; X86-NEXT: movl %edx, 8(%ecx) -; X86-NEXT: movl %esi, 12(%ecx) -; X86-NEXT: movl %edi, (%ecx) -; X86-NEXT: movl %eax, 4(%ecx) -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl %edx, %edi -; X86-NEXT: orl %eax, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $1, %edx +; X86-NEXT: shll %cl, %edx +; X86-NEXT: andl $96, %ecx +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: xorl %edx, (%eax,%ecx) +; X86-NEXT: movl (%eax), %ecx +; X86-NEXT: movl 4(%eax), %edx +; X86-NEXT: orl 12(%eax), %edx +; X86-NEXT: orl 8(%eax), %ecx +; X86-NEXT: orl %edx, %ecx ; X86-NEXT: setne %al -; X86-NEXT: leal -12(%ebp), %esp -; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx -; X86-NEXT: popl %ebp ; X86-NEXT: retl ; ; SSE-LABEL: complement_cmpz_i128: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx ; SSE-NEXT: movl $1, %eax -; SSE-NEXT: xorl %edx, %edx -; SSE-NEXT: shldq %cl, %rax, %rdx -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: xorl %esi, %esi -; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rax, %rdx -; SSE-NEXT: cmovneq %rsi, %rax -; SSE-NEXT: xorq 8(%rdi), %rdx -; SSE-NEXT: xorq (%rdi), %rax -; SSE-NEXT: movq %rax, (%rdi) -; SSE-NEXT: movq %rdx, 8(%rdi) -; SSE-NEXT: orq %rdx, %rax +; SSE-NEXT: shll %cl, %eax +; SSE-NEXT: andl $96, %ecx +; SSE-NEXT: shrl $3, %ecx +; SSE-NEXT: xorl %eax, (%rdi,%rcx) +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: orq 8(%rdi), %rax ; SSE-NEXT: setne %al ; SSE-NEXT: 
retq ; -; AVX2-LABEL: complement_cmpz_i128: -; AVX2: # %bb.0: -; AVX2-NEXT: movl %esi, %ecx -; AVX2-NEXT: movl $1, %eax -; AVX2-NEXT: xorl %edx, %edx -; AVX2-NEXT: shldq %cl, %rax, %rdx -; AVX2-NEXT: xorl %esi, %esi -; AVX2-NEXT: shlxq %rcx, %rax, %rax -; AVX2-NEXT: testb $64, %cl -; AVX2-NEXT: cmovneq %rax, %rdx -; AVX2-NEXT: cmovneq %rsi, %rax -; AVX2-NEXT: xorq 8(%rdi), %rdx -; AVX2-NEXT: xorq (%rdi), %rax -; AVX2-NEXT: movq %rax, (%rdi) -; AVX2-NEXT: movq %rdx, 8(%rdi) -; AVX2-NEXT: orq %rdx, %rax -; AVX2-NEXT: setne %al -; AVX2-NEXT: retq -; -; AVX512-LABEL: complement_cmpz_i128: -; AVX512: # %bb.0: -; AVX512-NEXT: movl %esi, %ecx -; AVX512-NEXT: xorl %eax, %eax -; AVX512-NEXT: movl $1, %edx -; AVX512-NEXT: xorl %esi, %esi -; AVX512-NEXT: shldq %cl, %rdx, %rsi -; AVX512-NEXT: shlxq %rcx, %rdx, %rdx -; AVX512-NEXT: testb $64, %cl -; AVX512-NEXT: cmovneq %rdx, %rsi -; AVX512-NEXT: cmovneq %rax, %rdx -; AVX512-NEXT: xorq 8(%rdi), %rsi -; AVX512-NEXT: xorq (%rdi), %rdx -; AVX512-NEXT: movq %rdx, (%rdi) -; AVX512-NEXT: movq %rsi, 8(%rdi) -; AVX512-NEXT: orq %rsi, %rdx -; AVX512-NEXT: setne %al -; AVX512-NEXT: retq +; AVX-LABEL: complement_cmpz_i128: +; AVX: # %bb.0: +; AVX-NEXT: # kill: def $esi killed $esi def $rsi +; AVX-NEXT: movl $1, %eax +; AVX-NEXT: shlxl %esi, %eax, %eax +; AVX-NEXT: andl $96, %esi +; AVX-NEXT: shrl $3, %esi +; AVX-NEXT: xorl %eax, (%rdi,%rsi) +; AVX-NEXT: movq (%rdi), %rax +; AVX-NEXT: orq 8(%rdi), %rax +; AVX-NEXT: setne %al +; AVX-NEXT: retq %rem = and i32 %position, 127 %ofs = zext nneg i32 %rem to i128 %bit = shl nuw i128 1, %ofs @@ -7155,14 +960,152 @@ define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind { define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-LABEL: reset_multiload_i128: ; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $96, %esi +; X86-NEXT: shrl $3, %esi +; X86-NEXT: movl (%ecx,%esi), %edi +; X86-NEXT: movl %edi, %ebx +; X86-NEXT: btrl %edx, %ebx +; X86-NEXT: btl %edx, %edi +; X86-NEXT: movl %ebx, (%ecx,%esi) +; X86-NEXT: jae .LBB22_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: xorl %eax, %eax +; X86-NEXT: .LBB22_2: +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl +; +; X64-LABEL: reset_multiload_i128: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: andl $96, %ecx +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: movl (%rdi,%rcx), %r9d +; X64-NEXT: movl %r9d, %r8d +; X64-NEXT: btrl %esi, %r8d +; X64-NEXT: xorl %eax, %eax +; X64-NEXT: btl %esi, %r9d +; X64-NEXT: jb .LBB22_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: movl (%rdx), %eax +; X64-NEXT: .LBB22_2: +; X64-NEXT: movl %r8d, (%rdi,%rcx) +; X64-NEXT: retq + %rem = and i32 %position, 127 + %ofs = zext nneg i32 %rem to i128 + %bit = shl nuw i128 1, %ofs + %mask = xor i128 %bit, -1 + %ld = load i128, ptr %word + %sel = load i32, ptr %p + %test = and i128 %ld, %bit + %res = and i128 %ld, %mask + %cmp = icmp eq i128 %test, 0 + store i128 %res, ptr %word + %ret = select i1 %cmp, i32 %sel, i32 0 + ret i32 %ret +} + +; Multiple uses of the store chain AND stored value +define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind { +; X86-LABEL: chain_reset_i256: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; 
X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl $-2, %edi +; X86-NEXT: roll %cl, %edi +; X86-NEXT: shrl $3, %ecx +; X86-NEXT: andl $28, %ecx +; X86-NEXT: andl %edi, (%esi,%ecx) +; X86-NEXT: movl 8(%esi), %ebx +; X86-NEXT: movl (%esi), %edi +; X86-NEXT: movl 4(%esi), %ecx +; X86-NEXT: movl 12(%esi), %ebp +; X86-NEXT: orl 28(%esi), %ebp +; X86-NEXT: orl 20(%esi), %ecx +; X86-NEXT: orl %ebp, %ecx +; X86-NEXT: orl 24(%esi), %ebx +; X86-NEXT: movl 16(%esi), %ebp +; X86-NEXT: orl %edi, %ebp +; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: movl (%edx), %esi +; X86-NEXT: movl %edi, (%edx) +; X86-NEXT: movl (%eax), %eax +; X86-NEXT: orl %ecx, %ebp +; X86-NEXT: jne .LBB23_2 +; X86-NEXT: # %bb.1: +; X86-NEXT: addl %esi, %eax +; X86-NEXT: .LBB23_2: +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl +; +; X64-LABEL: chain_reset_i256: +; X64: # %bb.0: +; X64-NEXT: # kill: def $ecx killed $ecx def $rcx +; X64-NEXT: movl $-2, %eax +; X64-NEXT: roll %cl, %eax +; X64-NEXT: shrl $3, %ecx +; X64-NEXT: andl $28, %ecx +; X64-NEXT: andl %eax, (%rdi,%rcx) +; X64-NEXT: movq (%rdi), %rcx +; X64-NEXT: movq 8(%rdi), %r8 +; X64-NEXT: orq 24(%rdi), %r8 +; X64-NEXT: movq 16(%rdi), %rdi +; X64-NEXT: orq %rcx, %rdi +; X64-NEXT: movl (%rsi), %eax +; X64-NEXT: movl %ecx, (%rsi) +; X64-NEXT: movl (%rdx), %ecx +; X64-NEXT: addl %ecx, %eax +; X64-NEXT: orq %r8, %rdi +; X64-NEXT: cmovnel %ecx, %eax +; X64-NEXT: retq + %rem = and i32 %position, 255 + %ofs = zext nneg i32 %rem to i256 + %bit = shl nuw i256 1, %ofs + %ld0 = load i256, ptr %p0 + %msk = xor i256 %bit, -1 + %res = and i256 %ld0, %msk + store i256 %res, ptr %p0 + %cmp = icmp ne i256 %res, 0 + %ld1 = load i32, ptr %p1 + %trunc = trunc i256 %res to i32 + store i32 %trunc, ptr %p1 + %ld2 = load i32, ptr %p2 + %add = add i32 %ld1, %ld2 + %sel = select i1 %cmp, i32 %ld2, i32 %add + ret i32 %sel +} + +; BTC/BT/BTS sequence on same i128 +define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind { +; X86-LABEL: sequence_i128: +; X86: # %bb.0: ; X86-NEXT: pushl %ebp ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: andl $-16, %esp -; X86-NEXT: subl $80, %esp -; X86-NEXT: movzbl 12(%ebp), %ecx +; X86-NEXT: subl $144, %esp +; X86-NEXT: movb 20(%ebp), %ch +; X86-NEXT: movb 12(%ebp), %cl ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -7176,54 +1119,80 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-NEXT: andb $12, %al ; X86-NEXT: negb %al ; X86-NEXT: movsbl %al, %eax -; X86-NEXT: movl 56(%esp,%eax), %esi -; X86-NEXT: movl 60(%esp,%eax), %edx -; X86-NEXT: shldl %cl, %esi, %edx -; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl 48(%esp,%eax), %edi -; X86-NEXT: movl 52(%esp,%eax), %eax -; X86-NEXT: shldl %cl, %eax, %esi +; X86-NEXT: movl 56(%esp,%eax), %edx +; X86-NEXT: movl 60(%esp,%eax), %esi +; X86-NEXT: shldl %cl, %edx, %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: shldl %cl, %edi, %eax -; X86-NEXT: movl 8(%ebp), %ebx +; X86-NEXT: movl 48(%esp,%eax), %edi +; X86-NEXT: movl 52(%esp,%eax), %ebx +; X86-NEXT: shldl %cl, %ebx, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %edi, %ebx ; X86-NEXT: shll %cl, %edi 
-; X86-NEXT: movl 8(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %esi, %ecx -; X86-NEXT: movl (%ebx), %esi +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $1, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movb %ch, %al +; X86-NEXT: shrb $3, %al +; X86-NEXT: andb $12, %al +; X86-NEXT: negb %al +; X86-NEXT: movsbl %al, %eax +; X86-NEXT: movl 84(%esp,%eax), %edx +; X86-NEXT: movl 88(%esp,%eax), %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl 20(%ebp), %ecx +; X86-NEXT: shldl %cl, %edx, %esi +; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 80(%esp,%eax), %esi ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %edi, %esi -; X86-NEXT: orl %ecx, %esi -; X86-NEXT: movl 12(%ebx), %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl 4(%ebx), %ebx -; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: andl %ebx, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: notl %ecx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: notl %ebx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT: notl %edx -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT: notl %edi -; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl 92(%esp,%eax), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: movl %esi, %eax +; X86-NEXT: shll %cl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: shldl %cl, %esi, %edx +; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl 8(%ebp), %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: xorl 8(%eax), %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT: xorl 12(%eax), %esi +; X86-NEXT: xorl (%eax), %edi +; X86-NEXT: xorl 4(%eax), %ebx +; X86-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) +; X86-NEXT: movl $0, {{[0-9]+}}(%esp) ; X86-NEXT: movl 16(%ebp), %eax -; X86-NEXT: movl (%eax), %eax -; X86-NEXT: movl 8(%ebp), %esi -; X86-NEXT: movl %ebx, 8(%esi) -; X86-NEXT: movl %ecx, 12(%esi) -; X86-NEXT: movl %edi, (%esi) -; X86-NEXT: movl %edx, 4(%esi) -; X86-NEXT: je .LBB22_2 -; X86-NEXT: # %bb.1: -; X86-NEXT: xorl %eax, %eax -; X86-NEXT: .LBB22_2: +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: andb $96, %al +; X86-NEXT: shrb $3, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl 96(%esp,%eax), %eax +; X86-NEXT: movl 16(%ebp), %ecx +; 
X86-NEXT: btl %ecx, %eax +; X86-NEXT: setae %al +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT: movl 8(%ebp), %ecx +; X86-NEXT: movl %edx, 8(%ecx) +; X86-NEXT: movl %esi, 12(%ecx) +; X86-NEXT: movl %edi, (%ecx) +; X86-NEXT: movl %ebx, 4(%ecx) ; X86-NEXT: leal -12(%ebp), %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -7231,73 +1200,129 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-NEXT: popl %ebp ; X86-NEXT: retl ; -; SSE-LABEL: reset_multiload_i128: +; SSE-LABEL: sequence_i128: ; SSE: # %bb.0: +; SSE-NEXT: movl %ecx, %eax ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl $1, %esi -; SSE-NEXT: xorl %r8d, %r8d -; SSE-NEXT: shldq %cl, %rsi, %r8 -; SSE-NEXT: xorl %eax, %eax -; SSE-NEXT: shlq %cl, %rsi +; SSE-NEXT: movl $1, %r8d +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: shldq %cl, %r8, %rsi +; SSE-NEXT: movl $1, %r9d +; SSE-NEXT: shlq %cl, %r9 +; SSE-NEXT: xorl %r11d, %r11d ; SSE-NEXT: testb $64, %cl -; SSE-NEXT: cmovneq %rsi, %r8 -; SSE-NEXT: cmovneq %rax, %rsi -; SSE-NEXT: movq (%rdi), %rcx -; SSE-NEXT: movq 8(%rdi), %r9 -; SSE-NEXT: movq %r9, %r10 -; SSE-NEXT: andq %r8, %r10 -; SSE-NEXT: notq %r8 -; SSE-NEXT: movq %rcx, %r11 -; SSE-NEXT: andq %rsi, %r11 -; SSE-NEXT: notq %rsi -; SSE-NEXT: andq %r9, %r8 -; SSE-NEXT: andq %rcx, %rsi -; SSE-NEXT: orq %r10, %r11 -; SSE-NEXT: jne .LBB22_2 -; SSE-NEXT: # %bb.1: -; SSE-NEXT: movl (%rdx), %eax -; SSE-NEXT: .LBB22_2: -; SSE-NEXT: movq %rsi, (%rdi) -; SSE-NEXT: movq %r8, 8(%rdi) -; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: cmovneq %r9, %rsi +; SSE-NEXT: cmovneq %r11, %r9 +; SSE-NEXT: xorl %r10d, %r10d +; SSE-NEXT: movl %eax, %ecx +; SSE-NEXT: shldq %cl, %r8, %r10 +; SSE-NEXT: shlq %cl, %r8 +; SSE-NEXT: testb $64, %al +; SSE-NEXT: cmovneq %r8, %r10 +; SSE-NEXT: cmovneq %r11, %r8 +; SSE-NEXT: xorq 8(%rdi), %rsi +; SSE-NEXT: xorq (%rdi), %r9 +; SSE-NEXT: movl %edx, %ecx +; SSE-NEXT: andb $32, %cl +; SSE-NEXT: movq %r9, %rax +; SSE-NEXT: shrdq %cl, %rsi, %rax +; SSE-NEXT: movq %rsi, %r11 +; SSE-NEXT: shrq %cl, %r11 +; SSE-NEXT: testb $64, %dl +; SSE-NEXT: cmoveq %rax, %r11 +; SSE-NEXT: btl %edx, %r11d +; SSE-NEXT: setae %al +; SSE-NEXT: orq %r10, %rsi +; SSE-NEXT: orq %r8, %r9 +; SSE-NEXT: movq %r9, (%rdi) +; SSE-NEXT: movq %rsi, 8(%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: reset_multiload_i128: -; AVX: # %bb.0: -; AVX-NEXT: movl %esi, %ecx -; AVX-NEXT: movl $1, %esi -; AVX-NEXT: xorl %r8d, %r8d -; AVX-NEXT: shldq %cl, %rsi, %r8 -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: shlxq %rcx, %rsi, %r9 -; AVX-NEXT: testb $64, %cl -; AVX-NEXT: cmovneq %r9, %r8 -; AVX-NEXT: cmovneq %rax, %r9 -; AVX-NEXT: movq (%rdi), %r10 -; AVX-NEXT: movq 8(%rdi), %r11 -; AVX-NEXT: andnq %r11, %r8, %rcx -; AVX-NEXT: andq %r8, %r11 -; AVX-NEXT: andnq %r10, %r9, %rsi -; AVX-NEXT: andq %r9, %r10 -; AVX-NEXT: orq %r11, %r10 -; AVX-NEXT: jne .LBB22_2 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl (%rdx), %eax -; AVX-NEXT: .LBB22_2: -; AVX-NEXT: movq %rsi, (%rdi) -; AVX-NEXT: movq %rcx, 8(%rdi) -; AVX-NEXT: # kill: def $eax killed $eax killed $rax -; AVX-NEXT: retq - %rem = and i32 %position, 127 - %ofs = zext nneg i32 %rem to i128 - %bit = shl nuw i128 1, %ofs - %mask = xor i128 %bit, -1 +; AVX2-LABEL: sequence_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: movl %ecx, %eax +; AVX2-NEXT: movl %esi, 
%ecx +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: movl $1, %r10d +; AVX2-NEXT: xorl %esi, %esi +; AVX2-NEXT: shldq %cl, %r10, %rsi +; AVX2-NEXT: shlxq %rcx, %r10, %r8 +; AVX2-NEXT: testb $64, %cl +; AVX2-NEXT: cmovneq %r8, %rsi +; AVX2-NEXT: cmovneq %r9, %r8 +; AVX2-NEXT: xorl %r11d, %r11d +; AVX2-NEXT: movl %eax, %ecx +; AVX2-NEXT: shldq %cl, %r10, %r11 +; AVX2-NEXT: shlxq %rax, %r10, %r10 +; AVX2-NEXT: testb $64, %al +; AVX2-NEXT: cmovneq %r10, %r11 +; AVX2-NEXT: cmovneq %r9, %r10 +; AVX2-NEXT: xorq 8(%rdi), %rsi +; AVX2-NEXT: xorq (%rdi), %r8 +; AVX2-NEXT: movl %edx, %ecx +; AVX2-NEXT: andb $32, %cl +; AVX2-NEXT: movq %r8, %rax +; AVX2-NEXT: shrdq %cl, %rsi, %rax +; AVX2-NEXT: shrxq %rcx, %rsi, %rcx +; AVX2-NEXT: testb $64, %dl +; AVX2-NEXT: cmoveq %rax, %rcx +; AVX2-NEXT: btl %edx, %ecx +; AVX2-NEXT: setae %al +; AVX2-NEXT: orq %r11, %rsi +; AVX2-NEXT: orq %r10, %r8 +; AVX2-NEXT: movq %r8, (%rdi) +; AVX2-NEXT: movq %rsi, 8(%rdi) +; AVX2-NEXT: retq +; +; AVX512-LABEL: sequence_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: movl %ecx, %eax +; AVX512-NEXT: movl %esi, %ecx +; AVX512-NEXT: movl $1, %r9d +; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: shldq %cl, %r9, %rsi +; AVX512-NEXT: xorl %r10d, %r10d +; AVX512-NEXT: shlxq %rcx, %r9, %r8 +; AVX512-NEXT: testb $64, %cl +; AVX512-NEXT: cmovneq %r8, %rsi +; AVX512-NEXT: cmovneq %r10, %r8 +; AVX512-NEXT: xorl %r11d, %r11d +; AVX512-NEXT: movl %eax, %ecx +; AVX512-NEXT: shldq %cl, %r9, %r11 +; AVX512-NEXT: shlxq %rax, %r9, %r9 +; AVX512-NEXT: testb $64, %al +; AVX512-NEXT: cmovneq %r9, %r11 +; AVX512-NEXT: cmovneq %r10, %r9 +; AVX512-NEXT: xorq 8(%rdi), %rsi +; AVX512-NEXT: xorq (%rdi), %r8 +; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: andb $32, %cl +; AVX512-NEXT: movq %r8, %rax +; AVX512-NEXT: shrdq %cl, %rsi, %rax +; AVX512-NEXT: shrxq %rcx, %rsi, %rcx +; AVX512-NEXT: testb $64, %dl +; AVX512-NEXT: cmoveq %rax, %rcx +; AVX512-NEXT: btl %edx, %ecx +; AVX512-NEXT: setae %al +; AVX512-NEXT: orq %r11, %rsi +; AVX512-NEXT: orq %r9, %r8 +; AVX512-NEXT: movq %r8, (%rdi) +; AVX512-NEXT: movq %rsi, 8(%rdi) +; AVX512-NEXT: retq + %rem0 = and i32 %pos0, 127 + %rem1 = and i32 %pos1, 127 + %rem2 = and i32 %pos2, 127 + %ofs0 = zext nneg i32 %rem0 to i128 + %ofs1 = zext nneg i32 %rem1 to i128 + %ofs2 = zext nneg i32 %rem2 to i128 + %bit0 = shl nuw i128 1, %ofs0 + %bit1 = shl nuw i128 1, %ofs1 + %bit2 = shl nuw i128 1, %ofs2 %ld = load i128, ptr %word - %sel = load i32, ptr %p - %test = and i128 %ld, %bit - %res = and i128 %ld, %mask - %cmp = icmp eq i128 %test, 0 - store i128 %res, ptr %word - %ret = select i1 %cmp, i32 %sel, i32 0 - ret i32 %ret + %res0 = xor i128 %ld, %bit0 + %test1 = and i128 %res0, %bit1 + %cmp1 = icmp eq i128 %test1, 0 + %res2 = or i128 %res0, %bit2 + store i128 %res2, ptr %word + ret i1 %cmp1 } diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll index feac3dcad243a..30f1874c51fed 100644 --- a/llvm/test/CodeGen/X86/gfni-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-shifts.ll @@ -1684,15 +1684,14 @@ define <64 x i8> @var_shl_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_shl_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; 
GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2 +; GFNIAVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; GFNIAVX512BW-NEXT: retq %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift @@ -1876,15 +1875,15 @@ define <64 x i8> @var_lshr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_lshr_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 {%k1} +; GFNIAVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; GFNIAVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; GFNIAVX512BW-NEXT: retq %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift @@ -2232,36 +2231,16 @@ define <64 x i8> @var_ashr_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; GFNIAVX512BW-LABEL: var_ashr_v64i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; GFNIAVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm5, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = 
zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; GFNIAVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3 -; GFNIAVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1 -; GFNIAVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; GFNIAVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; GFNIAVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2 +; GFNIAVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2 +; GFNIAVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 +; GFNIAVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; GFNIAVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; GFNIAVX512BW-NEXT: kmovq %rax, %k1 +; GFNIAVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} +; GFNIAVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; GFNIAVX512BW-NEXT: retq %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll index 0fb0420bb2609..aff2228c258b5 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -106,36 +106,16 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpsraw $4, %zmm2, %zmm3 -; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vpsraw $2, %zmm2, %zmm3 -; AVX512BW-NEXT: vpaddw %zmm4, %zmm4, %zmm5 -; AVX512BW-NEXT: vpmovb2m %zmm5, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vpsraw $1, %zmm2, %zmm3 -; AVX512BW-NEXT: vpsllw $2, %zmm4, %zmm4 -; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k1} -; AVX512BW-NEXT: vpsrlw $8, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpsraw $4, %zmm0, %zmm3 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = 
zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vpsraw $2, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddw %zmm1, %zmm1, %zmm4 -; AVX512BW-NEXT: vpmovb2m %zmm4, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vpsraw $1, %zmm0, %zmm3 -; AVX512BW-NEXT: vpsllw $2, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm0 {%k1} -; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsravw %zmm2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsraw $8, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; AVX512BW-NEXT: kmovq %rax, %k1 +; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: retq %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll index 103d5702fb93a..4450d07e01cca 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -85,20 +85,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpsrlw $1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpandq %zmm2, %zmm1, %zmm3 +; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpsrlvw %zmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} ; AVX512BW-NEXT: retq %shift = lshr <64 x i8> %a, %b diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll index efd742956ed09..41238acc4b74d 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll @@ -82,19 +82,14 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; ; AVX512BW-LABEL: var_shift_v64i8: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsllw $5, %zmm1, %zmm1 -; 
AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpsllw $2, %zmm0, %zmm2 -; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm2 +; AVX512BW-NEXT: vpsllvw %zmm2, %zmm0, %zmm2 +; AVX512BW-NEXT: vpsrlw $8, %zmm1, %zmm1 +; AVX512BW-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: movabsq $6148914691236517205, %rax # imm = 0x5555555555555555 +; AVX512BW-NEXT: kmovq %rax, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vpaddb %zmm0, %zmm0, %zmm0 {%k1} ; AVX512BW-NEXT: retq %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift diff --git a/llvm/test/DebugInfo/debug-bool-const-value.ll b/llvm/test/DebugInfo/debug-bool-const-value.ll new file mode 100644 index 0000000000000..84cf993cf4aae --- /dev/null +++ b/llvm/test/DebugInfo/debug-bool-const-value.ll @@ -0,0 +1,29 @@ +; REQUIRES: object-emission +; RUN: %llc_dwarf %s -filetype=obj -o - | llvm-dwarfdump - | FileCheck %s + +; CHECK: {{.*}}DW_TAG_variable +; CHECK-NEXT: {{.*}} DW_AT_const_value (1) +; CHECK-NEXT: {{.*}} DW_AT_name ("arg") + +define void @test() !dbg !5 +{ +entry: + call void @"llvm.dbg.value"(metadata i1 true, metadata !7, metadata !8), !dbg !6 + ret void, !dbg !6 +} + +declare void @"llvm.dbg.value"(metadata %".1", metadata %".2", metadata %".3") + +!llvm.dbg.cu = !{ !2 } +!llvm.module.flags = !{ !9, !10 } + +!1 = !DIFile(directory: "", filename: "test") +!2 = distinct !DICompileUnit(emissionKind: FullDebug, file: !1, isOptimized: false, language: DW_LANG_C_plus_plus, runtimeVersion: 0) +!3 = !DIBasicType(encoding: DW_ATE_boolean, name: "bool", size: 8) +!4 = !DISubroutineType(types: !{null}) +!5 = distinct !DISubprogram(file: !1, isDefinition: true, isLocal: false, isOptimized: false, line: 5, linkageName: "test", name: "test", scope: !1, scopeLine: 5, type: !4, unit: !2) +!6 = !DILocation(column: 1, line: 5, scope: !5) +!7 = !DILocalVariable(arg: 0, file: !1, line: 5, name: "arg", scope: !5, type: !3) +!8 = !DIExpression() +!9 = !{ i32 2, !"Dwarf Version", i32 4 } +!10 = !{ i32 2, !"Debug Info Version", i32 3 } diff --git a/llvm/test/Feature/intrinsics.ll b/llvm/test/Feature/intrinsics.ll index b6abc0fff6db7..a2da8f29116e1 100644 --- a/llvm/test/Feature/intrinsics.ll +++ b/llvm/test/Feature/intrinsics.ll @@ -64,7 +64,7 @@ define void @libm() { ; FIXME: test ALL the intrinsics in this file. 
-; CHECK: declare void @llvm.trap() #1 +; CHECK: declare void @llvm.trap() #2 declare void @llvm.trap() define void @trap() { @@ -72,5 +72,4 @@ define void @trap() { ret void } -; CHECK: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } -; CHECK: attributes #1 = { cold noreturn nounwind memory(inaccessiblemem: write) } +; CHECK: attributes #2 = { cold noreturn nounwind memory(inaccessiblemem: write) } diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll new file mode 100644 index 0000000000000..877fe5fe4b393 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-intrinsics.ll @@ -0,0 +1,355 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-intrinsics.ll +; +; Strictly handled: +; - llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) +; - llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) +; - llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float>, <16 x float>) #3 + +define <8 x i64> @test_mm512_cvtne2ps2bf16_512(<16 x float> %A, <16 x float> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_cvtne2ps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <32 x bfloat> [[TMP6]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP7]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast <32 x bfloat> %0 to <8 x i64> + ret <8 x i64> %1 +} + +define <8 x i64> @test_mm512_maskz_cvtne2ps2bf16_512(<16 x float> %A, <16 x float> %B, i32 %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_maskz_cvtne2ps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], <16 x float> [[B:%.*]], i32 [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; 
CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32 [[TMP2]] to <32 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32 [[U]] to <32 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <32 x i1> [[TMP9]], <32 x i16> zeroinitializer, <32 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <32 x bfloat> [[TMP7]] to <32 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <32 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <32 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP8]], <32 x i16> [[TMP14]], <32 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <32 x i1> [[TMP9]], <32 x bfloat> [[TMP7]], <32 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <32 x i16> [[_MSPROP_SELECT]] to <8 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x bfloat> [[TMP15]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP17]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast i32 %U to <32 x i1> + %2 = select <32 x i1> %1, <32 x bfloat> %0, <32 x bfloat> zeroinitializer + %3 = bitcast <32 x bfloat> %2 to <8 x i64> + ret <8 x i64> %3 +} + +define <8 x i64> @test_mm512_mask_cvtne2ps2bf16_512(<8 x i64> %C, i32 %U, <16 x float> %A, <16 x float> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x i64> @test_mm512_mask_cvtne2ps2bf16_512( +; CHECK-SAME: <8 x i64> [[C:%.*]], i32 [[U:%.*]], <16 x float> [[A:%.*]], <16 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: 
unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> [[A]], <16 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i64> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x i64> [[C]] to <32 x bfloat> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32 [[TMP3]] to <32 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32 [[U]] to <32 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <32 x i1> [[TMP12]], <32 x i16> zeroinitializer, <32 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <32 x bfloat> [[TMP8]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x bfloat> [[TMP10]] to <32 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <32 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <32 x i1> [[TMP11]], <32 x i16> [[TMP18]], <32 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <32 x i1> [[TMP12]], <32 x bfloat> [[TMP8]], <32 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <32 x i16> [[_MSPROP_SELECT]] to <8 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <32 x bfloat> [[TMP19]] to <8 x i64> +; CHECK-NEXT: store <8 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x i64> [[TMP21]] +; +entry: + %0 = tail call <32 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.512(<16 x float> %A, <16 x float> %B) #4 + %1 = bitcast <8 x i64> %C to <32 x bfloat> + %2 = bitcast i32 %U to <32 x i1> + %3 = select <32 x i1> %2, <32 x bfloat> %0, <32 x bfloat> %1 + %4 = bitcast <32 x bfloat> %3 to <8 x i64> + ret <8 x i64> %4 +} + +declare <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float>) #3 + +define <4 x i64> @test_mm512_cvtneps2bf16_512(<16 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_cvtneps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x bfloat> [[TMP4]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP5]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast <16 x bfloat> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define <4 x i64> @test_mm512_maskz_cvtneps2bf16_512(<16 x float> %A, i16 %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_maskz_cvtneps2bf16_512( +; CHECK-SAME: <16 x float> [[A:%.*]], i16 [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = 
bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP5:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP1]] to <16 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <16 x i1> [[TMP7]], <16 x i16> zeroinitializer, <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x bfloat> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = xor <16 x i16> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i16> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP6]], <16 x i16> [[TMP12]], <16 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP7]], <16 x bfloat> [[TMP5]], <16 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x bfloat> [[TMP13]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP14]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP15]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + %3 = bitcast <16 x bfloat> %2 to <4 x i64> + ret <4 x i64> %3 +} + +define <4 x i64> @test_mm512_mask_cvtneps2bf16_512(<4 x i64> %C, i16 %U, <16 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm512_mask_cvtneps2bf16_512( +; CHECK-SAME: <4 x i64> [[C:%.*]], i16 [[U:%.*]], <16 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> [[A]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i64> [[TMP1]] to <16 x i16> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i64> [[C]] to <16 x bfloat> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <16 x i1> [[TMP10]], <16 x i16> zeroinitializer, <16 x i16> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <16 x bfloat> [[TMP6]] to <16 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x bfloat> [[TMP8]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i16> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i16> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i16> 
[[TMP15]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP9]], <16 x i16> [[TMP16]], <16 x i16> [[TMP11]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP10]], <16 x bfloat> [[TMP6]], <16 x bfloat> [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <16 x bfloat> [[TMP17]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP19]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.512(<16 x float> %A) #4 + %1 = bitcast <4 x i64> %C to <16 x bfloat> + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1 + %4 = bitcast <16 x bfloat> %3 to <4 x i64> + ret <4 x i64> %4 +} + +declare <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float>, <32 x bfloat>, <32 x bfloat>) #3 + +define <16 x float> @test_mm512_dpbf16ps_512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_dpbf16ps_512( +; CHECK-SAME: <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP8]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + ret <16 x float> %0 +} + +define <16 x float> @test_mm512_maskz_dpbf16ps_512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B, i16 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_maskz_dpbf16ps_512( +; CHECK-SAME: <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]], i16 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 192), 
align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <16 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <16 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP16]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> zeroinitializer +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP17]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> zeroinitializer + ret <16 x float> %2 +} +define <16 x float> @test_mm512_mask_dpbf16ps_512(i16 zeroext %U, <16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <16 x float> @test_mm512_mask_dpbf16ps_512( +; CHECK-SAME: i16 zeroext [[U:%.*]], <16 x float> [[E:%.*]], <32 x bfloat> [[A:%.*]], <32 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP0]] to i512 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i512 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <32 x i16> [[TMP1]] to i512 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i512 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <32 x i16> [[TMP2]] to i512 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i512 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() 
#[[ATTR4]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> [[E]], <32 x bfloat> [[A]], <32 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP11]], <16 x i32> zeroinitializer, <16 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <16 x float> [[TMP9]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x float> [[E]] to <16 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP17]], <16 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP11]], <16 x float> [[TMP9]], <16 x float> [[E]] +; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x float> [[TMP18]] +; +entry: + %0 = tail call <16 x float> @llvm.x86.avx512bf16.dpbf16ps.512(<16 x float> %E, <32 x bfloat> %A, <32 x bfloat> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x float> %0, <16 x float> %E + ret <16 x float> %2 +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll new file mode 100644 index 0000000000000..ac65645a9ec2c --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-mov.ll @@ -0,0 +1,123 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-mov.ll +; +; Strictly handled: (none) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local void @funbf16(ptr readonly %src, ptr writeonly %dst) sanitize_memory { +; CHECK-LABEL: define dso_local void @funbf16( +; CHECK-SAME: ptr readonly [[SRC:%.*]], ptr writeonly [[DST:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x bfloat>, ptr [[SRC]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 +; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[_MSCMP4:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP4]], label %[[BB8:.*]], label %[[BB9:.*]], !prof [[PROF1]] +; CHECK: [[BB8]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB9]]: +; CHECK-NEXT: 
[[TMP10:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = xor i64 [[TMP10]], 87960930222080 +; CHECK-NEXT: [[TMP12:%.*]] = inttoptr i64 [[TMP11]] to ptr +; CHECK-NEXT: store <8 x i16> [[_MSLD]], ptr [[TMP12]], align 1 +; CHECK-NEXT: store <8 x bfloat> [[TMP4]], ptr [[DST]], align 1 +; CHECK-NEXT: [[_MSCMP5:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP5]], label %[[BB13:.*]], label %[[BB14:.*]], !prof [[PROF1]] +; CHECK: [[BB13]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP15:%.*]] = load <8 x bfloat>, ptr [[SRC]], align 32 +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = xor i64 [[TMP16]], 87960930222080 +; CHECK-NEXT: [[TMP18:%.*]] = inttoptr i64 [[TMP17]] to ptr +; CHECK-NEXT: [[_MSLD1:%.*]] = load <8 x i16>, ptr [[TMP18]], align 32 +; CHECK-NEXT: [[_MSCMP6:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP6]], label %[[BB19:.*]], label %[[BB20:.*]], !prof [[PROF1]] +; CHECK: [[BB19]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB20]]: +; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = xor i64 [[TMP21]], 87960930222080 +; CHECK-NEXT: [[TMP23:%.*]] = inttoptr i64 [[TMP22]] to ptr +; CHECK-NEXT: store <8 x i16> [[_MSLD1]], ptr [[TMP23]], align 32 +; CHECK-NEXT: store <8 x bfloat> [[TMP15]], ptr [[DST]], align 32 +; CHECK-NEXT: [[_MSCMP7:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP7]], label %[[BB24:.*]], label %[[BB25:.*]], !prof [[PROF1]] +; CHECK: [[BB24]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB25]]: +; CHECK-NEXT: [[TMP26:%.*]] = load <16 x bfloat>, ptr [[SRC]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP28:%.*]] = xor i64 [[TMP27]], 87960930222080 +; CHECK-NEXT: [[TMP29:%.*]] = inttoptr i64 [[TMP28]] to ptr +; CHECK-NEXT: [[_MSLD2:%.*]] = load <16 x i16>, ptr [[TMP29]], align 1 +; CHECK-NEXT: [[_MSCMP8:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP8]], label %[[BB30:.*]], label %[[BB31:.*]], !prof [[PROF1]] +; CHECK: [[BB30]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB31]]: +; CHECK-NEXT: [[TMP32:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP33:%.*]] = xor i64 [[TMP32]], 87960930222080 +; CHECK-NEXT: [[TMP34:%.*]] = inttoptr i64 [[TMP33]] to ptr +; CHECK-NEXT: store <16 x i16> [[_MSLD2]], ptr [[TMP34]], align 1 +; CHECK-NEXT: store <16 x bfloat> [[TMP26]], ptr [[DST]], align 1 +; CHECK-NEXT: [[_MSCMP9:%.*]] = icmp ne i64 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[_MSCMP9]], label %[[BB35:.*]], label %[[BB36:.*]], !prof [[PROF1]] +; CHECK: [[BB35]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; CHECK-NEXT: unreachable +; CHECK: [[BB36]]: +; CHECK-NEXT: [[TMP37:%.*]] = load <16 x bfloat>, ptr [[SRC]], align 32 +; CHECK-NEXT: [[TMP38:%.*]] = ptrtoint ptr [[SRC]] to i64 +; CHECK-NEXT: [[TMP39:%.*]] = xor i64 [[TMP38]], 87960930222080 +; CHECK-NEXT: [[TMP40:%.*]] = inttoptr i64 [[TMP39]] to ptr +; CHECK-NEXT: [[_MSLD3:%.*]] = load <16 x i16>, ptr [[TMP40]], align 32 +; CHECK-NEXT: [[_MSCMP10:%.*]] = icmp ne i64 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP10]], label %[[BB41:.*]], label %[[BB42:.*]], !prof [[PROF1]] +; CHECK: [[BB41]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR3]] +; 
CHECK-NEXT: unreachable +; CHECK: [[BB42]]: +; CHECK-NEXT: [[TMP43:%.*]] = ptrtoint ptr [[DST]] to i64 +; CHECK-NEXT: [[TMP44:%.*]] = xor i64 [[TMP43]], 87960930222080 +; CHECK-NEXT: [[TMP45:%.*]] = inttoptr i64 [[TMP44]] to ptr +; CHECK-NEXT: store <16 x i16> [[_MSLD3]], ptr [[TMP45]], align 32 +; CHECK-NEXT: store <16 x bfloat> [[TMP37]], ptr [[DST]], align 32 +; CHECK-NEXT: ret void +; +entry: + %0 = load <8 x bfloat>, ptr %src, align 1 + store <8 x bfloat> %0, ptr %dst, align 1 + %1 = load <8 x bfloat>, ptr %src, align 32 + store <8 x bfloat> %1, ptr %dst, align 32 + %2 = load <16 x bfloat>, ptr %src, align 1 + store <16 x bfloat> %2, ptr %dst, align 1 + %3 = load <16 x bfloat>, ptr %src, align 32 + store <16 x bfloat> %3, ptr %dst, align 32 + ret void +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll new file mode 100644 index 0000000000000..904614e961d6c --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bf16-vl-intrinsics.ll @@ -0,0 +1,774 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=msan -mattr=+avx512bf16 -mattr=+avx512vl < %s | FileCheck %s +; +; Forked from llvm/test/CodeGen/X86/avx512bf16-vl-intrinsics.ll +; +; Strictly handled: +; - llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) +; - llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) +; - llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) +; - llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) +; - llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) +; - llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %6, <4 x i1> %4) +; +; Heuristically handled: (none) + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float>, <4 x float>) #1 + +define <2 x i64> @test_mm_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B) local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_cvtne2ps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5:[0-9]+]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; 
CHECK-NEXT: ret <2 x i64> [[TMP7]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128(<4 x float> %A, <4 x float> %B, i8 zeroext %U) local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_maskz_cvtne2ps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <8 x i1> [[TMP9]], <8 x i16> zeroinitializer, <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <8 x bfloat> [[TMP7]] to <8 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <8 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP8]], <8 x i16> [[TMP14]], <8 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <8 x i1> [[TMP9]], <8 x bfloat> [[TMP7]], <8 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x bfloat> [[TMP15]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP17]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A, <4 x float> %B) local_unnamed_addr #0 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm_mask_cvtne2ps2bf16_128( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]], <4 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load 
i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> [[A]], <4 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <2 x i64> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP12]], <8 x i16> zeroinitializer, <8 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x bfloat> [[TMP10]] to <8 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <8 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP11]], <8 x i16> [[TMP18]], <8 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP12]], <8 x bfloat> [[TMP8]], <8 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <8 x bfloat> [[TMP19]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP21]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.128(<4 x float> %A, <4 x float> %B) #2 + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = bitcast i8 %U to <8 x i1> + %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float>, <8 x float>) #3 + +define <4 x i64> @test_mm256_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_cvtne2ps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> 
[[B]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x bfloat> [[TMP6]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP7]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast <16 x bfloat> %0 to <4 x i64> + ret <4 x i64> %1 +} + +define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256(<8 x float> %A, <8 x float> %B, i16 zeroext %U) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_maskz_cvtne2ps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], i16 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB5:.*]], label %[[BB6:.*]], !prof [[PROF1]] +; CHECK: [[BB5]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB6]]: +; CHECK-NEXT: [[TMP7:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> [[B]]) +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16 [[TMP2]] to <16 x i1> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = select <16 x i1> [[TMP9]], <16 x i16> zeroinitializer, <16 x i16> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <16 x bfloat> [[TMP7]] to <16 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = xor <16 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i16> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i16> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP8]], <16 x i16> [[TMP14]], <16 x i16> [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = select <16 x i1> [[TMP9]], <16 x bfloat> [[TMP7]], <16 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <16 x bfloat> [[TMP15]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP16]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP17]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast i16 %U to <16 x i1> + %2 = select <16 x i1> %1, <16 x bfloat> %0, <16 x bfloat> zeroinitializer + %3 = bitcast <16 x bfloat> %2 to <4 x i64> + ret <4 x i64> %3 +} + +define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256(<4 x i64> %C, i16 zeroext %U, <8 x float> %A, <8 x float> %B) local_unnamed_addr #1 sanitize_memory { +; CHECK-LABEL: define <4 x i64> @test_mm256_mask_cvtne2ps2bf16_256( +; CHECK-SAME: <4 x i64> [[C:%.*]], i16 zeroext [[U:%.*]], <8 x float> [[A:%.*]], <8 x float> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; 
CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> [[A]], <8 x float> [[B]]) +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i64> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i64> [[C]] to <16 x bfloat> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16 [[TMP3]] to <16 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[U]] to <16 x i1> +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP12]], <16 x i16> zeroinitializer, <16 x i16> [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x bfloat> [[TMP8]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x bfloat> [[TMP10]] to <16 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i16> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i16> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i16> [[TMP17]], [[TMP9]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP11]], <16 x i16> [[TMP18]], <16 x i16> [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP12]], <16 x bfloat> [[TMP8]], <16 x bfloat> [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i16> [[_MSPROP_SELECT]] to <4 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <16 x bfloat> [[TMP19]] to <4 x i64> +; CHECK-NEXT: store <4 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x i64> [[TMP21]] +; +entry: + %0 = tail call <16 x bfloat> @llvm.x86.avx512bf16.cvtne2ps2bf16.256(<8 x float> %A, <8 x float> %B) #4 + %1 = bitcast <4 x i64> %C to <16 x bfloat> + %2 = bitcast i16 %U to <16 x i1> + %3 = select <16 x i1> %2, <16 x bfloat> %0, <16 x bfloat> %1 + %4 = bitcast <16 x bfloat> %3 to <4 x i64> + ret <4 x i64> %4 +} + +declare <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float>) #3 + +define <2 x i64> @test_mm256_cvtneps2bf16_256(<8 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_cvtneps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP1]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB2:.*]], label %[[BB3:.*]], !prof [[PROF1]] +; CHECK: [[BB2]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x bfloat> [[TMP4]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> 
zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP5]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256(<8 x float> %A, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_maskz_cvtneps2bf16_256( +; CHECK-SAME: <8 x float> [[A:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB3:.*]], label %[[BB4:.*]], !prof [[PROF1]] +; CHECK: [[BB3]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB4]]: +; CHECK-NEXT: [[TMP5:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8 [[TMP1]] to <8 x i1> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i16> zeroinitializer, <8 x i16> zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i16> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i16> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP6]], <8 x i16> [[TMP12]], <8 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP7]], <8 x bfloat> [[TMP5]], <8 x bfloat> zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP14]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP15]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x bfloat> %0, <8 x bfloat> zeroinitializer + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm256_mask_cvtneps2bf16_256(<2 x i64> %C, i8 zeroext %U, <8 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm256_mask_cvtneps2bf16_256( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <8 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] +; CHECK: [[BB4]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB5]]: +; CHECK-NEXT: [[TMP6:%.*]] = 
tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> [[A]]) +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8 [[TMP2]] to <8 x i1> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = select <8 x i1> [[TMP10]], <8 x i16> zeroinitializer, <8 x i16> [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x bfloat> [[TMP8]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i16> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i16> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i16> [[TMP15]], [[TMP7]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP9]], <8 x i16> [[TMP16]], <8 x i16> [[TMP11]] +; CHECK-NEXT: [[TMP17:%.*]] = select <8 x i1> [[TMP10]], <8 x bfloat> [[TMP6]], <8 x bfloat> [[TMP8]] +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x bfloat> [[TMP17]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP18]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP19]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> %A) #4 + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = bitcast i8 %U to <8 x i1> + %3 = select <8 x i1> %2, <8 x bfloat> %0, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float>, <8 x bfloat>, <4 x i1>) #3 + +define <2 x i64> @test_mm128_cvtneps2bf16_128(<4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_cvtneps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> poison, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[TMP1]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP2]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4 + %1 = bitcast <8 x bfloat> %0 to <2 x i64> + ret <2 x i64> %1 +} + +define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128(<4 x float> %A, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_maskz_cvtneps2bf16_128( +; CHECK-SAME: <4 x float> [[A:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP4:%.*]] = 
shufflevector <8 x i1> [[TMP3]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i4 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: br i1 [[_MSOR]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> zeroinitializer, <4 x i1> [[TMP4]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x bfloat> [[TMP9]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP10]] +; +entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = shufflevector <8 x i1> %0, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> zeroinitializer, <4 x i1> %1) #4 + %3 = bitcast <8 x bfloat> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define <2 x i64> @test_mm128_mask_cvtneps2bf16_128(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_mask_cvtneps2bf16_128( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[_MSPROP:%.*]] = shufflevector <8 x i1> [[TMP3]], <8 x i1> splat (i1 true), <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <4 x i32> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP8]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x i16> [[TMP6]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP9]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i1> [[_MSPROP]] to i4 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i4 [[TMP10]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB11:.*]], label %[[BB12:.*]], !prof [[PROF1]] +; CHECK: [[BB11]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB12]]: +; CHECK-NEXT: [[TMP13:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> [[TMP7]], <4 x i1> [[TMP5]]) +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x bfloat> [[TMP13]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP14]] +; 
+entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = shufflevector <8 x i1> %0, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %2 = bitcast <2 x i64> %C to <8 x bfloat> + %3 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> %2, <4 x i1> %1) #4 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +define <2 x i64> @test_mm128_cvtneps2bf16_128_select(<2 x i64> %C, i8 zeroext %U, <4 x float> %A) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <2 x i64> @test_mm128_cvtneps2bf16_128_select( +; CHECK-SAME: <2 x i64> [[C:%.*]], i8 zeroext [[U:%.*]], <4 x float> [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8 [[TMP0]] to <8 x i1> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <2 x i64> [[TMP1]] to <8 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <2 x i64> [[C]] to <8 x bfloat> +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP7:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> [[A]], <8 x bfloat> poison, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP8:%.*]] = select <8 x i1> [[TMP4]], <8 x i16> zeroinitializer, <8 x i16> [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x bfloat> [[TMP7]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <8 x bfloat> [[TMP6]] to <8 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = xor <8 x i16> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i16> [[TMP12]], [[TMP5]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP3]], <8 x i16> [[TMP13]], <8 x i16> [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP4]], <8 x bfloat> [[TMP7]], <8 x bfloat> [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[_MSPROP_SELECT]] to <2 x i64> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x bfloat> [[TMP14]] to <2 x i64> +; CHECK-NEXT: store <2 x i64> [[TMP15]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> [[TMP16]] +; +entry: + %0 = bitcast i8 %U to <8 x i1> + %1 = bitcast <2 x i64> %C to <8 x bfloat> + %2 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> %A, <8 x bfloat> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) #4 + %3 = select <8 x i1> %0, <8 x bfloat> %2, <8 x bfloat> %1 + %4 = bitcast <8 x bfloat> %3 to <2 x i64> + ret <2 x i64> %4 +} + +declare <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float>, <16 x bfloat>, <16 x bfloat>) #3 + +define <8 x float> @test_mm256_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_dpbf16ps_256( +; CHECK-SAME: <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr 
@__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: store <8 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP8]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4 + ret <8 x float> %0 +} + +define <8 x float> @test_mm256_maskz_dpbf16ps_256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B, i8 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_maskz_dpbf16ps_256( +; CHECK-SAME: <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]], i8 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 96), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> zeroinitializer, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x float> [[TMP9]] to <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <8 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP16]], <8 x i32> [[TMP12]] +; CHECK-NEXT: 
[[TMP17:%.*]] = select <8 x i1> [[TMP11]], <8 x float> [[TMP9]], <8 x float> zeroinitializer +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP17]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer + ret <8 x float> %2 +} +define <8 x float> @test_mm256_mask_dpbf16ps_256(i8 zeroext %U, <8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <8 x float> @test_mm256_mask_dpbf16ps_256( +; CHECK-SAME: i8 zeroext [[U:%.*]], <8 x float> [[E:%.*]], <16 x bfloat> [[A:%.*]], <16 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP0]] to i256 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i256 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i16> [[TMP1]] to i256 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i256 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i16> [[TMP2]] to i256 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i256 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> [[E]], <16 x bfloat> [[A]], <16 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8 [[TMP3]] to <8 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8 [[U]] to <8 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <8 x i1> [[TMP11]], <8 x i32> zeroinitializer, <8 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x float> [[TMP9]] to <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x float> [[E]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <8 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP10]], <8 x i32> [[TMP17]], <8 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP11]], <8 x float> [[TMP9]], <8 x float> [[E]] +; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <8 x float> [[TMP18]] +; +entry: + %0 = tail call <8 x float> @llvm.x86.avx512bf16.dpbf16ps.256(<8 x float> %E, <16 x bfloat> %A, <16 x bfloat> %B) #4 + %1 = bitcast i8 %U to <8 x i1> + %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %E + ret <8 x float> %2 +} + +declare <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float>, <8 x bfloat>, <8 x bfloat>) #3 + +define <4 x float> @test_mm128_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; 
CHECK-LABEL: define <4 x float> @test_mm128_dpbf16ps_128( +; CHECK-SAME: <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP3]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB6:.*]], label %[[BB7:.*]], !prof [[PROF1]] +; CHECK: [[BB6]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB7]]: +; CHECK-NEXT: [[TMP8:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP8]] +; +entry: + %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + ret <4 x float> %0 +} + +define <4 x float> @test_mm128_maskz_dpbf16ps_128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B, i4 zeroext %U) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x float> @test_mm128_maskz_dpbf16ps_128( +; CHECK-SAME: <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]], i4 zeroext [[U:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 48), align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i4 [[TMP3]] to <4 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i4 [[U]] to <4 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> 
zeroinitializer, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x float> [[TMP9]] to <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = xor <4 x i32> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP16]], <4 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP9]], <4 x float> zeroinitializer +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP17]] +; +entry: + %0 = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + %1 = bitcast i4 %U to <4 x i1> + %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> zeroinitializer + ret <4 x float> %2 +} +define <4 x float> @test_mm128_mask_dpbf16ps_128(i4 zeroext %U, <4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) local_unnamed_addr #2 sanitize_memory { +; CHECK-LABEL: define <4 x float> @test_mm128_mask_dpbf16ps_128( +; CHECK-SAME: i4 zeroext [[U:%.*]], <4 x float> [[E:%.*]], <8 x bfloat> [[A:%.*]], <8 x bfloat> [[B:%.*]]) local_unnamed_addr #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i4, ptr @__msan_param_tls, align 8 +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP0]] to i128 +; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i128 [[TMP4]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i16> [[TMP1]] to i128 +; CHECK-NEXT: [[_MSCMP1:%.*]] = icmp ne i128 [[TMP5]], 0 +; CHECK-NEXT: [[_MSOR:%.*]] = or i1 [[_MSCMP]], [[_MSCMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i16> [[TMP2]] to i128 +; CHECK-NEXT: [[_MSCMP2:%.*]] = icmp ne i128 [[TMP6]], 0 +; CHECK-NEXT: [[_MSOR3:%.*]] = or i1 [[_MSOR]], [[_MSCMP2]] +; CHECK-NEXT: br i1 [[_MSOR3]], label %[[BB7:.*]], label %[[BB8:.*]], !prof [[PROF1]] +; CHECK: [[BB7]]: +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: unreachable +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = tail call <4 x float> @llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> [[E]], <8 x bfloat> [[A]], <8 x bfloat> [[B]]) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i4 [[TMP3]] to <4 x i1> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i4 [[U]] to <4 x i1> +; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> zeroinitializer, <4 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <4 x float> [[TMP9]] to <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x float> [[E]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = xor <4 x i32> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = or <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP0]] +; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP17]], <4 x i32> [[TMP12]] +; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[TMP11]], <4 x float> [[TMP9]], <4 x float> [[E]] +; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <4 x float> [[TMP18]] +; +entry: + %0 = tail call <4 x float> 
@llvm.x86.avx512bf16.dpbf16ps.128(<4 x float> %E, <8 x bfloat> %A, <8 x bfloat> %B) #4 + %1 = bitcast i4 %U to <4 x i1> + %2 = select <4 x i1> %1, <4 x float> %0, <4 x float> %E + ret <4 x float> %2 +} + +define <16 x i16> @test_no_vbroadcast1() sanitize_memory { +; CHECK-LABEL: define <16 x i16> @test_no_vbroadcast1( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x bfloat> [[TMP0]] to <8 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i16> [[TMP2]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + %1 = bitcast <8 x bfloat> %0 to <8 x i16> + %2 = shufflevector <8 x i16> %1, <8 x i16> poison, <16 x i32> zeroinitializer + ret <16 x i16> %2 +} + +define <16 x bfloat> @test_no_vbroadcast2() nounwind sanitize_memory { +; CHECK-LABEL: define <16 x bfloat> @test_no_vbroadcast2( +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> splat (i1 true)) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x bfloat> [[TMP0]], <8 x bfloat> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: store <16 x i16> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x bfloat> [[TMP1]] +; +entry: + %0 = tail call <8 x bfloat> @llvm.x86.avx512bf16.mask.cvtneps2bf16.128(<4 x float> poison, <8 x bfloat> zeroinitializer, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + %1 = shufflevector <8 x bfloat> %0, <8 x bfloat> poison, <16 x i32> zeroinitializer + ret <16 x bfloat> %1 +} + +define <16 x i32> @pr83358() sanitize_memory { +; CHECK-LABEL: define <16 x i32> @pr83358( +; CHECK-SAME: ) #[[ATTR1]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x bfloat> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3> +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %1 = call <8 x bfloat> @llvm.x86.avx512bf16.cvtneps2bf16.256(<8 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00, float 8.000000e+00>) + %2 = bitcast <8 x bfloat> %1 to <4 x i32> + %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, 
i32 0, i32 1, i32 2, i32 3> + ret <16 x i32> %3 +} +;. +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575} +;. diff --git a/llvm/test/Linker/drop-attribute.ll b/llvm/test/Linker/drop-attribute.ll index 9be95a89109b4..3d4c13c2ffc75 100644 --- a/llvm/test/Linker/drop-attribute.ll +++ b/llvm/test/Linker/drop-attribute.ll @@ -39,7 +39,7 @@ define void @test_nocallback_definition() nocallback { declare void @test_nocallback_call_site() ; Test that checks that nocallback attribute on an intrinsic is NOT dropped. -; CHECK: ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn +; CHECK: ; Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn ; CHECK-NEXT: declare float @llvm.sqrt.f32(float) #0 declare float @llvm.sqrt.f32(float) nocallback diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt index f5cb4b72959f9..2661ed5b04cc9 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc-encoding-ISAFuture.txt @@ -82,12 +82,18 @@ #CHECK: lxvprll 6, 2, 1 0x7c 0xc2 0x0c 0xda +#CHECK: lxvpb32x 2, 15, 16 +0x7c,0x4f,0x86,0xda + #CHECK: stxvprl 0, 1, 2 0x7c 0x01 0x15 0x9a #CHECK: stxvprll 6, 0, 1 0x7c 0xc0 0x0d 0xda +#CHECK: stxvpb32x 2, 15, 16 +0x7c,0x4f,0x87,0xda + #CHECK: dmxvi8gerx4 1, 2, 4 0xec,0x82,0x20,0x58 diff --git a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt index f0df8ce39021b..7fb8254ced0ac 100644 --- a/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt +++ b/llvm/test/MC/Disassembler/PowerPC/ppc64le-encoding-ISAFuture.txt @@ -76,12 +76,18 @@ #CHECK: lxvprll 6, 2, 1 0xda 0x0c 0xc2 0x7c +#CHECK: lxvpb32x 2, 15, 16 +0xda,0x86,0x4f,0x7c + #CHECK: stxvprl 0, 1, 2 0x9a 0x15 0x01 0x7c #CHECK: stxvprll 6, 0, 1 0xda 0x0d 0xc0 0x7c +#CHECK: stxvpb32x 2, 15, 16 +0xda,0x87,0x4f,0x7c + #CHECK: dmxvi8gerx4 1, 2, 4 0x58,0x20,0x82,0xec diff --git a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s index bc0683e38887c..40059c440b128 100644 --- a/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s +++ b/llvm/test/MC/PowerPC/ppc-encoding-ISAFuture.s @@ -105,6 +105,10 @@ # CHECK-LE: lxvprll 6, 2, 1 # encoding: [0xda,0x0c,0xc2,0x7c] lxvprll 6, 2, 1 + lxvpb32x 2, 15, 16 +#CHECK-BE: lxvpb32x 2, 15, 16 # encoding: [0x7c,0x4f,0x86,0xda] +#CHECK-LE: lxvpb32x 2, 15, 16 # encoding: [0xda,0x86,0x4f,0x7c] + # CHECK-BE: stxvprl 0, 1, 2 # encoding: [0x7c,0x01,0x15,0x9a] # CHECK-LE: stxvprl 0, 1, 2 # encoding: [0x9a,0x15,0x01,0x7c] stxvprl 0, 1, 2 @@ -113,6 +117,10 @@ # CHECK-LE: stxvprll 6, 0, 1 # encoding: [0xda,0x0d,0xc0,0x7c] stxvprll 6, 0, 1 + stxvpb32x 2, 15, 16 +#CHECK-BE: stxvpb32x 2, 15, 16 # encoding: [0x7c,0x4f,0x87,0xda] +#CHECK-LE: stxvpb32x 2, 15, 16 # encoding: [0xda,0x87,0x4f,0x7c] + dmxvi8gerx4 1, 2, 4 # CHECK-BE: dmxvi8gerx4 1, 2, 4 # encoding: [0xec,0x82,0x20,0x58] # CHECK-LE: dmxvi8gerx4 1, 2, 4 # encoding: [0x58,0x20,0x82,0xec] diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll index 65b96c8b8ef5d..62975a3cf8ac4 100644 --- a/llvm/test/Other/new-pm-defaults.ll +++ b/llvm/test/Other/new-pm-defaults.ll @@ -208,6 +208,7 @@ ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass +; CHECK-O23SZ-NEXT: Running pass: 
DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll index 3a0fffe426da1..012a1ab5802b5 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll @@ -133,6 +133,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll index 4623edcaf6656..e021ff3124b60 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll @@ -118,6 +118,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll index 590afd925e841..20f94bc2e0f6c 100644 --- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll @@ -127,6 +127,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll index dd6acd2c51ee7..b61edc805108d 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll @@ -165,6 +165,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll index ee054527e20bd..acf8c053d0f82 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll @@ -167,6 +167,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running 
pass: CorrelatedValuePropagationPass diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll index fd95e94f3c8b9..6b3c5ca7605de 100644 --- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll +++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll @@ -131,6 +131,7 @@ ; CHECK-O-NEXT: Running pass: BDCEPass ; CHECK-O-NEXT: Running analysis: DemandedBitsAnalysis ; CHECK-O-NEXT: Running pass: InstCombinePass +; CHECK-O23SZ-NEXT: Running pass: DFAJumpThreadingPass ; CHECK-O23SZ-NEXT: Running pass: JumpThreadingPass ; CHECK-O23SZ-NEXT: Running analysis: LazyValueAnalysis ; CHECK-O23SZ-NEXT: Running pass: CorrelatedValuePropagationPass diff --git a/llvm/test/TableGen/directive1.td b/llvm/test/TableGen/directive1.td index 475faf9254157..8648651f3d714 100644 --- a/llvm/test/TableGen/directive1.td +++ b/llvm/test/TableGen/directive1.td @@ -61,6 +61,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: #include <utility> // CHECK-EMPTY: // CHECK-NEXT: namespace llvm { +// CHECK-EMPTY: // CHECK-NEXT: namespace tdl { // CHECK-EMPTY: // CHECK-NEXT: LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); @@ -176,6 +177,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Clause> { // CHECK-NEXT: static constexpr bool is_iterable = true; // CHECK-NEXT: }; +// CHECK-EMPTY: // CHECK-NEXT: } // namespace llvm // CHECK-EMPTY: // CHECK-NEXT: #endif // LLVM_Tdl_INC diff --git a/llvm/test/TableGen/directive2.td b/llvm/test/TableGen/directive2.td index ccc09446b4465..96022d7647440 100644 --- a/llvm/test/TableGen/directive2.td +++ b/llvm/test/TableGen/directive2.td @@ -54,6 +54,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: #include <utility> // CHECK-EMPTY: // CHECK-NEXT: namespace llvm { +// CHECK-EMPTY: // CHECK-NEXT: namespace tdl { // CHECK-EMPTY: // CHECK-NEXT: enum class Association { @@ -132,6 +133,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: LLVM_ABI Association getDirectiveAssociation(Directive D); // CHECK-NEXT: LLVM_ABI Category getDirectiveCategory(Directive D); // CHECK-NEXT: LLVM_ABI SourceLanguage getDirectiveLanguages(Directive D); +// CHECK-EMPTY: // CHECK-NEXT: } // namespace tdl // CHECK-EMPTY: // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Association> { @@ -149,6 +151,7 @@ def TDL_DirA : Directive<[Spelling<"dira">]> { // CHECK-NEXT: template <> struct enum_iteration_traits<tdl::Clause> { // CHECK-NEXT: static constexpr bool is_iterable = true; // CHECK-NEXT: }; +// CHECK-EMPTY: // CHECK-NEXT: } // namespace llvm // CHECK-EMPTY: // CHECK-NEXT: #endif // LLVM_Tdl_INC diff --git a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll index 95a52aa0f7f52..b509b2469cfdc 100644 --- a/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll +++ b/llvm/test/Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll @@ -1,8 +1,8 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt -codegen-opt-level=1 -S -mtriple=aarch64-- -passes=atomic-expand %s | FileCheck %s ; RUN: opt -codegen-opt-level=1 -S -mtriple=aarch64-- -mattr=+outline-atomics -passes=atomic-expand %s | FileCheck %s --check-prefix=OUTLINE-ATOMICS -define void @atomic_swap_f16(ptr %ptr, half 
%val) nounwind { +define void @atomic_swap_f16(ptr %ptr, half %val) !prof !0 { ; CHECK-LABEL: @atomic_swap_f16( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast half [[VAL:%.*]] to i16 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] @@ -12,7 +12,7 @@ define void @atomic_swap_f16(ptr %ptr, half %val) nounwind { ; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP4]], ptr elementtype(i16) [[PTR]]) ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]] +; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: atomicrmw.end: ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16 [[TMP3]] to half ; CHECK-NEXT: ret void @@ -27,7 +27,7 @@ define void @atomic_swap_f16(ptr %ptr, half %val) nounwind { ret void } -define void @atomic_swap_f32(ptr %ptr, float %val) nounwind { +define void @atomic_swap_f32(ptr %ptr, float %val) nounwind !prof !0 { ; CHECK-LABEL: @atomic_swap_f32( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[VAL:%.*]] to i32 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] @@ -37,7 +37,7 @@ define void @atomic_swap_f32(ptr %ptr, float %val) nounwind { ; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP4]], ptr elementtype(i32) [[PTR]]) ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP5]], 0 -; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]] +; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1]] ; CHECK: atomicrmw.end: ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[TMP3]] to float ; CHECK-NEXT: ret void @@ -52,7 +52,7 @@ define void @atomic_swap_f32(ptr %ptr, float %val) nounwind { ret void } -define void @atomic_swap_f64(ptr %ptr, double %val) nounwind { +define void @atomic_swap_f64(ptr %ptr, double %val) nounwind !prof !0 { ; CHECK-LABEL: @atomic_swap_f64( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double [[VAL:%.*]] to i64 ; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] @@ -60,7 +60,7 @@ define void @atomic_swap_f64(ptr %ptr, double %val) nounwind { ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.aarch64.ldaxr.p0(ptr elementtype(i64) [[PTR:%.*]]) ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.aarch64.stxr.p0(i64 [[TMP1]], ptr elementtype(i64) [[PTR]]) ; CHECK-NEXT: [[TRYAGAIN:%.*]] = icmp ne i32 [[TMP3]], 0 -; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]] +; CHECK-NEXT: br i1 [[TRYAGAIN]], label [[ATOMICRMW_START]], label [[ATOMICRMW_END:%.*]], !prof [[PROF1]] ; CHECK: atomicrmw.end: ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP2]] to double ; CHECK-NEXT: ret void @@ -74,3 +74,17 @@ define void @atomic_swap_f64(ptr %ptr, double %val) nounwind { %t1 = atomicrmw xchg ptr %ptr, double %val acquire ret void } + +!0 = !{!"function_entry_count", i64 1000} +;. +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nofree nounwind willreturn } +;. +; OUTLINE-ATOMICS: attributes #[[ATTR0:[0-9]+]] = { "target-features"="+outline-atomics" } +; OUTLINE-ATOMICS: attributes #[[ATTR1:[0-9]+]] = { nounwind "target-features"="+outline-atomics" } +;. +; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000} +; CHECK: [[PROF1]] = !{!"unknown", !"atomic-expand"} +;. +; OUTLINE-ATOMICS: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000} +;. 
diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll index 649e9467c0318..fffe50fde1e50 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/X86/min-legal-vector-width.ll @@ -9,15 +9,25 @@ target triple = "x86_64-unknown-linux-gnu" ; This should promote define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512(ptr %arg, ptr readonly %arg1) #0 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer512 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0:![0-9]+]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -66,15 +76,25 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 
-; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1:[0-9]+]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -123,15 +143,25 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer512_call_avx512_legal512_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR1]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x 
i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -180,15 +210,25 @@ bb: ; This should promote define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512(ptr %arg, ptr readonly %arg1) #0 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal512_prefer512 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR0]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -237,13 +277,21 @@ bb: ; This should not promote define internal fastcc void @callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(ptr %arg, ptr readonly %arg1) #1 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: 
readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR1]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -290,13 +338,21 @@ bb: ; This should not promote define internal fastcc void @callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(ptr %arg, ptr readonly %arg1) #2 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2:[0-9]+]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], ptr noalias nofree noundef nonnull readonly align 64 captures(none) dereferenceable(64) [[ARG1:%.*]]) #[[ATTR2]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -343,15 +399,25 @@ bb: ; This should promote define internal fastcc void @callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256(ptr %arg, ptr readonly %arg1) #3 { ; -; 
CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx2_legal256_prefer256_call_avx2_legal512_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -400,15 +466,25 @@ bb: ; This should promote define internal fastcc void @callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256(ptr %arg, ptr readonly %arg1) #4 { ; -; CHECK: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable -; CHECK-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256 -; CHECK-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] { -; CHECK-NEXT: bb: -; CHECK-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 -; CHECK-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64 -; CHECK-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 -; CHECK-NEXT: ret void +; TUNIT: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; TUNIT-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256 +; TUNIT-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] { +; TUNIT-NEXT: bb: +; TUNIT-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: [[TMP:%.*]] 
= load <8 x i64>, ptr [[ARG1_PRIV]], align 64 +; TUNIT-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; TUNIT-NEXT: ret void +; +; CGSCC: Function Attrs: inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable +; CGSCC-LABEL: define {{[^@]+}}@callee_avx2_legal512_prefer256_call_avx2_legal256_prefer256 +; CGSCC-SAME: (ptr noalias nofree noundef nonnull writeonly align 64 captures(none) dereferenceable(64) [[ARG:%.*]], <8 x i64> [[TMP0:%.*]]) #[[ATTR3]] { +; CGSCC-NEXT: bb: +; CGSCC-NEXT: [[ARG1_PRIV:%.*]] = alloca <8 x i64>, align 64 +; CGSCC-NEXT: store <8 x i64> [[TMP0]], ptr [[ARG1_PRIV]], align 64 +; CGSCC-NEXT: [[TMP:%.*]] = load <8 x i64>, ptr [[ARG1_PRIV]], align 64, !invariant.load [[META0]] +; CGSCC-NEXT: store <8 x i64> [[TMP]], ptr [[ARG]], align 64 +; CGSCC-NEXT: ret void ; bb: %tmp = load <8 x i64>, ptr %arg1 @@ -464,6 +540,14 @@ attributes #3 = { inlinehint norecurse nounwind uwtable "target-features"="+avx2 attributes #4 = { inlinehint norecurse nounwind uwtable "target-features"="+avx2" "min-legal-vector-width"="256" "prefer-vector-width"="256" } attributes #5 = { argmemonly nounwind } ;. +; CGSCC: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" } +; CGSCC: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" } +; CGSCC: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" } +; CGSCC: attributes #[[ATTR3]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx2" } +; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } +; CGSCC: attributes #[[ATTR5]] = { nofree willreturn memory(write) } +; CGSCC: attributes #[[ATTR6]] = { nofree nounwind willreturn } +;. ; TUNIT: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" } ; TUNIT: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" } ; TUNIT: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" } @@ -472,11 +556,7 @@ attributes #5 = { argmemonly nounwind } ; TUNIT: attributes #[[ATTR5]] = { nofree willreturn memory(write) } ; TUNIT: attributes #[[ATTR6]] = { nofree nosync nounwind willreturn } ;. 
-; CGSCC: attributes #[[ATTR0]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="512" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR1]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR2]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="256" "prefer-vector-width"="256" "target-features"="+avx512vl" } -; CGSCC: attributes #[[ATTR3]] = { inlinehint mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) uwtable "min-legal-vector-width"="512" "prefer-vector-width"="256" "target-features"="+avx2" } -; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } -; CGSCC: attributes #[[ATTR5]] = { nofree willreturn memory(write) } -; CGSCC: attributes #[[ATTR6]] = { nofree nounwind willreturn } +; CGSCC: [[META0]] = !{} ;. +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/Transforms/Attributor/align-ptrmask.ll b/llvm/test/Transforms/Attributor/align-ptrmask.ll new file mode 100644 index 0000000000000..008f5e1b8a46e --- /dev/null +++ b/llvm/test/Transforms/Attributor/align-ptrmask.ll @@ -0,0 +1,206 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=attributor -S < %s | FileCheck %s + +define ptr @align_ptrmask_back_no_prop(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_no_prop( +; CHECK-SAME: ptr nofree writeonly align 2 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_prop(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 16 dereferenceable(4) ptr @align_ptrmask_back_prop( +; CHECK-SAME: ptr nofree writeonly align 16 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 16 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 16 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 16 + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask(ptr align 2 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr 
@align_ptrmask_forward_mask( +; CHECK-SAME: ptr nofree readnone align 2 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + ret ptr %p +} + +define ptr @align_ptrmask_forward_ptr(ptr align 16 %x, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_forward_ptr( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 -8 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 -8 + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + ret ptr %p +} + +define ptr @align_ptrmask_forward_nonconst_mask(ptr align 8 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr @align_ptrmask_forward_nonconst_mask( +; CHECK-SAME: ptr nofree readnone align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 [[Y]] +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 %y + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + ret ptr %p +} + +define ptr @align_ptrmask_back_nonconst_mask(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_nonconst_mask( +; CHECK-SAME: ptr nofree writeonly align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP1]], i64 -32, i64 [[Y]] +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[CMP2]], i64 [[SEL]], i64 -16 +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 [[SEL1]]) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %sel = select i1 %cmp1, i64 -32, i64 %y + %sel1 = select i1 %cmp2, i64 %sel, i64 -16 + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 %sel1) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_const_back_noprop(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_const_back_noprop( +; CHECK-SAME: ptr nofree writeonly align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + store float 1.0, ptr %p, align 8 + ret ptr %p 
+} + +define ptr @align_ptrmask_back_const_back_prop(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define noundef nonnull align 8 dereferenceable(4) ptr @align_ptrmask_back_const_back_prop( +; CHECK-SAME: ptr nofree writeonly align 8 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[P:%.*]] = tail call noundef nonnull align 8 dereferenceable(4) ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -2) #[[ATTR4]] +; CHECK-NEXT: store float 1.000000e+00, ptr [[P]], align 8 +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -2) + store float 1.0, ptr %p, align 8 + ret ptr %p +} + +define ptr @align_ptrmask_back_const_forward_mask(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 8 ptr @align_ptrmask_back_const_forward_mask( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 8 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + ret ptr %p +} + +define ptr @align_ptrmask_back_const_forward_ptr(ptr align 16 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_back_const_forward_ptr( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8) + ret ptr %p +} + +; FIXME: The store will create AAAlign for %ptr1, +; but the attribute didn't propagate through extractelement, need propagate +define <2 x ptr> @ptrmask_v2p0_v2i64(<2 x ptr> align 2 %ptr, i64 %a) { +; CHECK-LABEL: define <2 x ptr> @ptrmask_v2p0_v2i64( +; CHECK-SAME: <2 x ptr> align 2 [[PTR:%.*]], i64 [[A:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[RESULT:%.*]] = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> [[PTR]], <2 x i64> noundef splat (i64 -8)) #[[ATTR4]] +; CHECK-NEXT: [[PTR1:%.*]] = extractelement <2 x ptr> [[RESULT]], i32 0 +; CHECK-NEXT: [[PTR2:%.*]] = extractelement <2 x ptr> [[RESULT]], i32 1 +; CHECK-NEXT: store i64 [[A]], ptr [[PTR1]], align 16 +; CHECK-NEXT: store i64 [[A]], ptr [[PTR2]], align 16 +; CHECK-NEXT: ret <2 x ptr> [[RESULT]] +; + %result = call <2 x ptr> @llvm.ptrmask.v2p0.v2i64(<2 x ptr> %ptr, <2 x i64> splat(i64 -8)) + %ptr1 = extractelement <2 x ptr> %result, i32 0 + %ptr2 = extractelement <2 x ptr> %result, i32 1 + store i64 %a, ptr %ptr1, align 16 + store i64 %a, ptr %ptr2, align 16 + ret <2 x ptr> %result +} + +define ptr @align_ptrmask_forward_mask_positive(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4 ptr @align_ptrmask_forward_mask_positive( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef 2) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 2) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_poison(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4 ptr @align_ptrmask_forward_mask_poison( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail 
call align 4 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 poison) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 poison) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_max(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4294967296 ptr @align_ptrmask_forward_mask_max( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4294967296 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -4294967296) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -4294967296) + ret ptr %p +} + +define ptr @align_ptrmask_forward_mask_max_plus_one(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 4294967296 ptr @align_ptrmask_forward_mask_max_plus_one( +; CHECK-SAME: ptr nofree readnone align 4 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 4294967296 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -8589934592) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -8589934592) + ret ptr %p +} + +define ptr @align_ptrmask_back_callsite(ptr align 4 %x, i64 %y, i1 %cmp1, i1 %cmp2) { +; CHECK-LABEL: define align 16 ptr @align_ptrmask_back_callsite( +; CHECK-SAME: ptr nofree readnone align 16 [[X:%.*]], i64 [[Y:%.*]], i1 [[CMP1:%.*]], i1 [[CMP2:%.*]]) #[[ATTR1]] { +; CHECK-NEXT: [[P:%.*]] = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr [[X]], i64 noundef -4) #[[ATTR4]] +; CHECK-NEXT: ret ptr [[P]] +; + %p = tail call align 16 ptr @llvm.ptrmask.p0.i64(ptr %x, i64 -4) + ret ptr %p +} diff --git a/llvm/test/Transforms/Attributor/nofree.ll b/llvm/test/Transforms/Attributor/nofree.ll index 2a9d5d91ae053..94aa79aa327f4 100644 --- a/llvm/test/Transforms/Attributor/nofree.ll +++ b/llvm/test/Transforms/Attributor/nofree.ll @@ -238,7 +238,7 @@ define void @call_both() #0 { ; TEST 10 (positive case) ; Call intrinsic function -; CHECK: Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +; CHECK: Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.floor.f32(float) define void @call_floor(float %a) #0 { @@ -489,7 +489,7 @@ attributes #2 = { nobuiltin nounwind } ; TUNIT: attributes #[[ATTR3]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR4]] = { mustprogress nofree noinline nosync nounwind willreturn memory(none) uwtable } ; TUNIT: attributes #[[ATTR5:[0-9]+]] = { nofree noinline nounwind memory(none) uwtable } -; TUNIT: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; TUNIT: attributes #[[ATTR6:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; TUNIT: attributes #[[ATTR7]] = { nofree nounwind } ; TUNIT: attributes #[[ATTR8]] = { nobuiltin nofree nounwind } ; TUNIT: attributes #[[ATTR9]] = { nosync memory(none) } @@ -506,7 +506,7 @@ attributes #2 = { nobuiltin nounwind } ; CGSCC: attributes #[[ATTR3]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) uwtable } ; CGSCC: attributes #[[ATTR4:[0-9]+]] = { nofree noinline nounwind memory(none) uwtable } ; CGSCC: attributes #[[ATTR5]] = { mustprogress nofree noinline nosync nounwind 
willreturn memory(none) uwtable } -; CGSCC: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CGSCC: attributes #[[ATTR6:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CGSCC: attributes #[[ATTR7]] = { nofree nounwind } ; CGSCC: attributes #[[ATTR8]] = { nobuiltin nofree nounwind } ; CGSCC: attributes #[[ATTR9]] = { nosync memory(none) } diff --git a/llvm/test/Transforms/Attributor/nosync.ll b/llvm/test/Transforms/Attributor/nosync.ll index 7ef46e8e94c9e..c15bd775ddb4d 100644 --- a/llvm/test/Transforms/Attributor/nosync.ll +++ b/llvm/test/Transforms/Attributor/nosync.ll @@ -454,7 +454,7 @@ define void @nosync_convergent_callee_test() { ; CHECK: attributes #[[ATTR14:[0-9]+]] = { convergent memory(none) } ; CHECK: attributes #[[ATTR15]] = { memory(none) } ; CHECK: attributes #[[ATTR16]] = { nounwind } -; CHECK: attributes #[[ATTR17:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR17:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR18]] = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) } ; CHECK: attributes #[[ATTR19]] = { nosync memory(none) } ; CHECK: attributes #[[ATTR20]] = { nofree nounwind } diff --git a/llvm/test/Transforms/Attributor/willreturn.ll b/llvm/test/Transforms/Attributor/willreturn.ll index b7ac7fc2970b0..d65480b05759a 100644 --- a/llvm/test/Transforms/Attributor/willreturn.ll +++ b/llvm/test/Transforms/Attributor/willreturn.ll @@ -276,7 +276,7 @@ define void @conditional_exit(i32 %0, ptr nocapture readonly %1) local_unnamed_a ; TEST 6 (positive case) ; Call intrinsic function -; CHECK: Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +; CHECK: Function Attrs: nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.floor.f32(float) define void @call_floor(float %a) #0 { @@ -425,7 +425,7 @@ define i32 @loop_constant_trip_count(ptr nocapture readonly %0) #0 { ; CHECK-NEXT: [[TMP4:%.*]] = phi i64 [ 0, [[TMP1:%.*]] ], [ [[TMP9:%.*]], [[TMP3]] ] ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ 0, [[TMP1]] ], [ [[TMP8]], [[TMP3]] ] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !invariant.load [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP8]] = add nsw i32 [[TMP7]], [[TMP5]] ; CHECK-NEXT: [[TMP9]] = add nuw nsw i64 [[TMP4]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 10 @@ -472,7 +472,7 @@ define i32 @loop_trip_count_unbound(i32 %0, i32 %1, ptr nocapture readonly %2, i ; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP14]], [[TMP8]] ], [ 0, [[TMP4]] ] ; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !invariant.load [[META0]] ; CHECK-NEXT: [[TMP14]] = add nsw i32 [[TMP13]], [[TMP10]] ; CHECK-NEXT: [[TMP15]] = add i32 [[TMP9]], [[TMP3]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], [[TMP1]] @@ -522,7 +522,7 @@ define i32 @loop_trip_dec(i32 %0, ptr nocapture readonly %1) local_unnamed_addr ; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 
[[TMP5]], [[TMP4]] ], [ [[TMP12:%.*]], [[TMP6]] ] ; CHECK-NEXT: [[TMP8:%.*]] = phi i32 [ 0, [[TMP4]] ], [ [[TMP11:%.*]], [[TMP6]] ] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !invariant.load [[META0]] ; CHECK-NEXT: [[TMP11]] = add nsw i32 [[TMP10]], [[TMP8]] ; CHECK-NEXT: [[TMP12]] = add nsw i64 [[TMP7]], -1 ; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt i64 [[TMP7]], 0 @@ -1294,7 +1294,7 @@ attributes #1 = { uwtable noinline } ; TUNIT: attributes #[[ATTR5]] = { noreturn } ; TUNIT: attributes #[[ATTR6]] = { noinline noreturn nounwind uwtable } ; TUNIT: attributes #[[ATTR7]] = { noinline nounwind uwtable } -; TUNIT: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; TUNIT: attributes #[[ATTR8:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; TUNIT: attributes #[[ATTR9:[0-9]+]] = { norecurse willreturn } ; TUNIT: attributes #[[ATTR10]] = { mustprogress noinline nounwind willreturn uwtable } ; TUNIT: attributes #[[ATTR11:[0-9]+]] = { noinline willreturn uwtable } @@ -1332,7 +1332,7 @@ attributes #1 = { uwtable noinline } ; CGSCC: attributes #[[ATTR5]] = { noreturn } ; CGSCC: attributes #[[ATTR6]] = { noinline noreturn nounwind uwtable } ; CGSCC: attributes #[[ATTR7]] = { noinline nounwind uwtable } -; CGSCC: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CGSCC: attributes #[[ATTR8:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CGSCC: attributes #[[ATTR9:[0-9]+]] = { norecurse willreturn } ; CGSCC: attributes #[[ATTR10]] = { mustprogress noinline nounwind willreturn uwtable } ; CGSCC: attributes #[[ATTR11:[0-9]+]] = { noinline willreturn uwtable } @@ -1364,3 +1364,7 @@ attributes #1 = { uwtable noinline } ; CGSCC: attributes #[[ATTR37]] = { nosync willreturn memory(read) } ; CGSCC: attributes #[[ATTR38]] = { willreturn memory(read) } ;. +; TUNIT: [[META0]] = !{} +;. +; CGSCC: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll b/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll index cf871e5714bf5..1dbffd962a638 100644 --- a/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll +++ b/llvm/test/Transforms/EarlyCSE/replace-calls-def-attrs.ll @@ -262,7 +262,7 @@ define i32 @commutative_intrinsic_intersection_failure(i32 %arg, i32 %arg1) { } ;. 
-; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK: attributes #[[ATTR1]] = { memory(none) } ; CHECK: attributes #[[ATTR2]] = { memory(read) } ; CHECK: attributes #[[ATTR3]] = { alwaysinline memory(none) } diff --git a/llvm/test/Transforms/InstCombine/binop-phi-operands.ll b/llvm/test/Transforms/InstCombine/binop-phi-operands.ll index 9e049837b0352..f0d4ad74fbe05 100644 --- a/llvm/test/Transforms/InstCombine/binop-phi-operands.ll +++ b/llvm/test/Transforms/InstCombine/binop-phi-operands.ll @@ -653,12 +653,11 @@ define i8 @mul_const_incoming0_speculatable(i1 %b, i8 %x, i8 %y) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[B:%.*]], label [[IF:%.*]], label [[THEN:%.*]] ; CHECK: if: +; CHECK-NEXT: [[TMP0:%.*]] = mul i8 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: br label [[THEN]] ; CHECK: then: -; CHECK-NEXT: [[P0:%.*]] = phi i8 [ 42, [[ENTRY:%.*]] ], [ [[X:%.*]], [[IF]] ] -; CHECK-NEXT: [[P1:%.*]] = phi i8 [ 17, [[ENTRY]] ], [ [[Y:%.*]], [[IF]] ] +; CHECK-NEXT: [[R:%.*]] = phi i8 [ -54, [[ENTRY:%.*]] ], [ [[TMP0]], [[IF]] ] ; CHECK-NEXT: call void @sideeffect() -; CHECK-NEXT: [[R:%.*]] = mul i8 [[P0]], [[P1]] ; CHECK-NEXT: ret i8 [[R]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/binop-select.ll b/llvm/test/Transforms/InstCombine/binop-select.ll index 25f624ee13412..fe1ec9014f188 100644 --- a/llvm/test/Transforms/InstCombine/binop-select.ll +++ b/llvm/test/Transforms/InstCombine/binop-select.ll @@ -335,7 +335,7 @@ define i32 @sub_sel_op1_use(i1 %b) { define float @fadd_sel_op0(i1 %b, float %x) { ; CHECK-LABEL: @fadd_sel_op0( -; CHECK-NEXT: [[R:%.*]] = select nnan i1 [[B:%.*]], float 0xFFF0000000000000, float 0x7FF0000000000000 +; CHECK-NEXT: [[R:%.*]] = select i1 [[B:%.*]], float 0xFFF0000000000000, float 0x7FF0000000000000 ; CHECK-NEXT: ret float [[R]] ; %s = select i1 %b, float 0xFFF0000000000000, float 0x7FF0000000000000 @@ -403,3 +403,188 @@ define i32 @ashr_sel_op1_use(i1 %b) { %r = ashr i32 -2, %s ret i32 %r } + +define i8 @commonArgWithOr0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr0( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 0, i8 8 +; CHECK-NEXT: [[V2:%.*]] = or disjoint i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithOr1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr1( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 17, i8 23 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithOr2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithOr2( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 5, i8 42 +; CHECK-NEXT: [[V2:%.*]] = or i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 42 + %v2 = or i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd0( +; CHECK-NEXT: ret i8 16 +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = 
and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd1( +; CHECK-NEXT: ret i8 16 +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 8, i8 1 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd2( +; CHECK-NEXT: [[V2:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAnd3(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAnd3( +; CHECK-NEXT: [[V2:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 42 + %v2 = and i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor0( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 0, i8 8 +; CHECK-NEXT: [[V2:%.*]] = or disjoint i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 0, i8 8 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor1(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor1( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 9, i8 1 +; CHECK-NEXT: [[V2:%.*]] = xor i8 [[V1]], [[V0]] +; CHECK-NEXT: ret i8 [[V2]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 9, i8 1 + %v2 = xor i8 %v1, %v0 + ret i8 %v2 +} + +define i8 @commonArgWithXor2(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor2( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 1, i8 7 +; CHECK-NEXT: [[V2:%.*]] = xor i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 1, i8 7 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithXor3(i1 %arg0) { +; CHECK-LABEL: @commonArgWithXor3( +; CHECK-NEXT: [[V0:%.*]] = zext i1 [[ARG0:%.*]] to i8 +; CHECK-NEXT: [[V1:%.*]] = select i1 [[ARG0]], i8 5, i8 45 +; CHECK-NEXT: [[V2:%.*]] = xor i8 [[V1]], [[V0]] +; CHECK-NEXT: [[V3:%.*]] = or disjoint i8 [[V2]], 16 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 45 + %v2 = xor i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i8 @commonArgWithAdd0(i1 %arg0) { +; CHECK-LABEL: @commonArgWithAdd0( +; CHECK-NEXT: [[V3:%.*]] = select i1 [[ARG0:%.*]], i8 22, i8 61 +; CHECK-NEXT: ret i8 [[V3]] +; + %v0 = zext i1 %arg0 to i8 + %v1 = select i1 %arg0, i8 21, i8 45 + %v2 = add i8 %v1, %v0 + %v3 = or i8 %v2, 16 + ret i8 %v3 +} + +define i32 @OrSelectIcmpZero(i32 %a, i32 %b) { +; CHECK-LABEL: @OrSelectIcmpZero( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[OR:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +define i32 @OrSelectIcmpNonZero(i32 %a, i32 %b) { +; CHECK-LABEL: @OrSelectIcmpNonZero( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; 
CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 42 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 42 + %or = or i32 %sel, %a + ret i32 %or +} diff --git a/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll b/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll index 45e47d8e781be..5e90d4b8d4419 100644 --- a/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll +++ b/llvm/test/Transforms/InstCombine/dont-distribute-phi.ll @@ -7,7 +7,7 @@ define zeroext i1 @foo(i32 %arg) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[ARG:%.*]], 37 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[ARG:%.*]], 37 ; CHECK-NEXT: br i1 [[CMP1]], label [[BB_ELSE:%.*]], label [[BB_THEN:%.*]] ; CHECK: bb_then: ; CHECK-NEXT: call void @bar() @@ -16,8 +16,7 @@ define zeroext i1 @foo(i32 %arg) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[ARG]], 17 ; CHECK-NEXT: br label [[BB_EXIT]] ; CHECK: bb_exit: -; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[CMP2]], [[BB_ELSE]] ], [ undef, [[BB_THEN]] ] -; CHECK-NEXT: [[AND1:%.*]] = and i1 [[PHI1]], [[CMP1]] +; CHECK-NEXT: [[AND1:%.*]] = phi i1 [ [[CMP2]], [[BB_THEN]] ], [ false, [[BB_ELSE]] ] ; CHECK-NEXT: ret i1 [[AND1]] ; @@ -43,7 +42,7 @@ bb_exit: define zeroext i1 @foo_logical(i32 %arg) { ; CHECK-LABEL: @foo_logical( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i32 [[ARG:%.*]], 37 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[ARG:%.*]], 37 ; CHECK-NEXT: br i1 [[CMP1]], label [[BB_ELSE:%.*]], label [[BB_THEN:%.*]] ; CHECK: bb_then: ; CHECK-NEXT: call void @bar() @@ -52,8 +51,7 @@ define zeroext i1 @foo_logical(i32 %arg) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[ARG]], 17 ; CHECK-NEXT: br label [[BB_EXIT]] ; CHECK: bb_exit: -; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[CMP2]], [[BB_ELSE]] ], [ undef, [[BB_THEN]] ] -; CHECK-NEXT: [[AND1:%.*]] = and i1 [[PHI1]], [[CMP1]] +; CHECK-NEXT: [[AND1:%.*]] = phi i1 [ [[CMP2]], [[BB_THEN]] ], [ false, [[BB_ELSE]] ] ; CHECK-NEXT: ret i1 [[AND1]] ; diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll index cd4a8e36c6e23..3cbf7090a13b8 100644 --- a/llvm/test/Transforms/InstCombine/fmul.ll +++ b/llvm/test/Transforms/InstCombine/fmul.ll @@ -1222,7 +1222,7 @@ define <2 x double> @negate_if_true_wrong_constant(<2 x double> %px, i1 %cond) { ; X *fast (C ? 1.0 : 0.0) -> C ? X : 0.0 define float @fmul_select(float %x, i1 %c) { ; CHECK-LABEL: @fmul_select( -; CHECK-NEXT: [[MUL:%.*]] = select fast i1 [[C:%.*]], float [[X:%.*]], float 0.000000e+00 +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[C:%.*]], float [[X:%.*]], float 0.000000e+00 ; CHECK-NEXT: ret float [[MUL]] ; %sel = select i1 %c, float 1.0, float 0.0 @@ -1233,7 +1233,7 @@ define float @fmul_select(float %x, i1 %c) { ; X *fast (C ? 1.0 : 0.0) -> C ? 
X : 0.0 define <2 x float> @fmul_select_vec(<2 x float> %x, i1 %c) { ; CHECK-LABEL: @fmul_select_vec( -; CHECK-NEXT: [[MUL:%.*]] = select fast i1 [[C:%.*]], <2 x float> [[X:%.*]], <2 x float> zeroinitializer +; CHECK-NEXT: [[MUL:%.*]] = select i1 [[C:%.*]], <2 x float> [[X:%.*]], <2 x float> zeroinitializer ; CHECK-NEXT: ret <2 x float> [[MUL]] ; %sel = select i1 %c, <2 x float> <float 1.0, float 1.0>, <2 x float> zeroinitializer diff --git a/llvm/test/Transforms/InstCombine/free-inversion.ll b/llvm/test/Transforms/InstCombine/free-inversion.ll index 4b69a5e77b4ce..2e8e75c3ab3ef 100644 --- a/llvm/test/Transforms/InstCombine/free-inversion.ll +++ b/llvm/test/Transforms/InstCombine/free-inversion.ll @@ -563,10 +563,10 @@ define i1 @test_inv_free(i1 %c1, i1 %c2, i1 %c3, i1 %c4) { ; CHECK: b2: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: b3: +; CHECK-NEXT: [[TMP0:%.*]] = and i1 [[C3:%.*]], [[C4:%.*]] ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[VAL_NOT:%.*]] = phi i1 [ false, [[B1]] ], [ true, [[B2]] ], [ [[C3:%.*]], [[B3]] ] -; CHECK-NEXT: [[COND_NOT:%.*]] = and i1 [[VAL_NOT]], [[C4:%.*]] +; CHECK-NEXT: [[COND_NOT:%.*]] = phi i1 [ false, [[B1]] ], [ [[C4]], [[B2]] ], [ [[TMP0]], [[B3]] ] ; CHECK-NEXT: br i1 [[COND_NOT]], label [[B5:%.*]], label [[B4:%.*]] ; CHECK: b4: ; CHECK-NEXT: ret i1 true diff --git a/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll b/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll new file mode 100644 index 0000000000000..a3b21ccc63e94 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/or-select-zero-icmp.ll @@ -0,0 +1,169 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +; Basic functional test +define i32 @basic(i32 %a, i32 %b) { +; CHECK-LABEL: @basic( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[A]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +; Operand order swap test +define i32 @swap_operand_order(i32 %x, i32 %y) { +; CHECK-LABEL: @swap_operand_order( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[RES:%.*]] = or i32 [[X]], [[SEL]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %y, i32 0 + %or = or i32 %x, %sel + ret i32 %or +} + +; Negative test: Non-zero false value in select +define i32 @negative_non_zero_false_val(i32 %a, i32 %b) { +; CHECK-LABEL: @negative_non_zero_false_val( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 1 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[A]] +; CHECK-NEXT: ret i32 [[OR]] +; + %cmp = icmp eq i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 1 + %or = or i32 %sel, %a + ret i32 %or +} + +; Negative test: Incorrect comparison predicate (NE) +define i32 @negative_wrong_predicate(i32 %a, i32 %b) { +; CHECK-LABEL: @negative_wrong_predicate( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A:%.*]], 0 +; CHECK-NEXT: [[OR:%.*]] = select i1 [[CMP]], i32 0, i32 [[TMP1:%.*]] +; CHECK-NEXT: [[OR1:%.*]] = or i32 [[OR]], [[A]] +; CHECK-NEXT: ret i32 [[OR1]] +; + %cmp = icmp ne i32 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %a + ret i32 %or +} + +; Comparison direction swap test (0 == X) +define i32 @cmp_swapped(i32 %x, i32 %y) { +; 
CHECK-LABEL: @cmp_swapped( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[RES:%.*]] = or i32 [[X]], [[SEL]] +; CHECK-NEXT: ret i32 [[RES]] +; + %cmp = icmp eq i32 0, %x + %sel = select i1 %cmp, i32 %y, i32 0 + %or = or i32 %x, %sel + ret i32 %or +} + +; Complex expression test +define i32 @complex_expression(i32 %a, i32 %b) { +; CHECK-LABEL: @complex_expression( +; CHECK-NEXT: [[X:%.*]] = add i32 [[A:%.*]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 0 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 [[X]] +; CHECK-NEXT: ret i32 [[RES]] +; + %x = add i32 %a, 1 + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %x + ret i32 %or +} + +; zext test +define i32 @zext_cond(i8 %a, i32 %b) { +; CHECK-LABEL: @zext_cond( +; CHECK-NEXT: [[Z:%.*]] = zext i8 [[A:%.*]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[Z]] +; CHECK-NEXT: ret i32 [[OR]] +; + %z = zext i8 %a to i32 + %cmp = icmp eq i8 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %z + ret i32 %or +} + +; sext test +define i32 @sext_cond(i8 %a, i32 %b) { +; CHECK-LABEL: @sext_cond( +; CHECK-NEXT: [[S:%.*]] = sext i8 [[A:%.*]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[A]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[B:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[S]] +; CHECK-NEXT: ret i32 [[OR]] +; + %s = sext i8 %a to i32 + %cmp = icmp eq i8 %a, 0 + %sel = select i1 %cmp, i32 %b, i32 0 + %or = or i32 %sel, %s + ret i32 %or +} + +; Vector type test +define <2 x i32> @vector_type(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: @vector_type( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i32> [[A:%.*]], zeroinitializer +; CHECK-NEXT: [[RES:%.*]] = select <2 x i1> [[CMP]], <2 x i32> [[B:%.*]], <2 x i32> [[A]] +; CHECK-NEXT: ret <2 x i32> [[RES]] +; + %cmp = icmp eq <2 x i32> %a, zeroinitializer + %sel = select <2 x i1> %cmp, <2 x i32> %b, <2 x i32> zeroinitializer + %or = or <2 x i32> %sel, %a + ret <2 x i32> %or +} + +; Pointer type test (should not trigger optimization) +define ptr @pointer_type(ptr %p, ptr %q) { +; CHECK-LABEL: @pointer_type( +; CHECK-NEXT: [[A:%.*]] = ptrtoint ptr [[P:%.*]] to i64 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], null +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], ptr [[Q:%.*]], ptr null +; CHECK-NEXT: [[SEL_INT:%.*]] = ptrtoint ptr [[SEL]] to i64 +; CHECK-NEXT: [[OR:%.*]] = or i64 [[A]], [[SEL_INT]] +; CHECK-NEXT: [[RET:%.*]] = inttoptr i64 [[OR]] to ptr +; CHECK-NEXT: ret ptr [[RET]] +; + %a = ptrtoint ptr %p to i64 + %cmp = icmp eq i64 %a, 0 + %sel = select i1 %cmp, ptr %q, ptr null + %sel_int = ptrtoint ptr %sel to i64 + %or_val = or i64 %a, %sel_int + %ret = inttoptr i64 %or_val to ptr + ret ptr %ret +} + +; Multi-use test (should not trigger optimization) +define i32 @multi_use_test(i32 %x, i32 %m) { +; CHECK-LABEL: @multi_use_test( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i32 [[M:%.*]], i32 0 +; CHECK-NEXT: [[OR:%.*]] = or i32 [[SEL]], [[X]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SEL]], [[X]] +; CHECK-NEXT: [[O2:%.*]] = sub i32 [[OR]], [[ADD]] +; CHECK-NEXT: ret i32 [[O2]] +; + %cmp = icmp eq i32 %x, 0 + %sel = select i1 %cmp, i32 %m, i32 0 + %or = or i32 %sel, %x + %add = add i32 %sel, %x + %res = sub i32 %or, %add + ret i32 
%res +} diff --git a/llvm/test/Transforms/InstCombine/recurrence.ll b/llvm/test/Transforms/InstCombine/recurrence.ll index f75e0d439c572..643e7efc243a3 100644 --- a/llvm/test/Transforms/InstCombine/recurrence.ll +++ b/llvm/test/Transforms/InstCombine/recurrence.ll @@ -24,9 +24,9 @@ loop: ; preds = %loop, %entry define i64 @test_or2(i64 %a, i64 %b) { ; CHECK-LABEL: @test_or2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[IV_NEXT:%.*]] = or i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV_NEXT:%.*]] = or i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: tail call void @use(i64 [[IV_NEXT]]) ; CHECK-NEXT: br label [[LOOP]] ; @@ -104,9 +104,9 @@ loop: ; preds = %loop, %entry define i64 @test_and2(i64 %a, i64 %b) { ; CHECK-LABEL: @test_and2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[IV_NEXT:%.*]] = and i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV_NEXT:%.*]] = and i64 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: tail call void @use(i64 [[IV_NEXT]]) ; CHECK-NEXT: br label [[LOOP]] ; diff --git a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll index 3d97048f43127..8b3c0502ac04d 100644 --- a/llvm/test/Transforms/InstCombine/select-safe-transforms.ll +++ b/llvm/test/Transforms/InstCombine/select-safe-transforms.ll @@ -256,27 +256,27 @@ define <2 x i1> @not_logical_or2(i1 %b, <2 x i32> %a) { ret <2 x i1> %and } -define i1 @bools_logical_commute0(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF2]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 - %and1 = select i1 %not, i1 %a, i1 false - %and2 = select i1 %c, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and1 = select i1 %not, i1 %a, i1 false, !prof!1 + %and2 = select i1 %c, i1 %b, i1 false, !prof !2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !3 ret i1 %or } -define i1 @bools_logical_commute0_and1(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0_and1(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0_and1( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF1]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %not, %a - %and2 = select i1 %c, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and2 = select i1 %c, i1 %b, i1 false, !prof !1 + %or = select i1 %and1, i1 true, i1 %and2, !prof !2 ret i1 %or } @@ -292,15 +292,15 @@ define i1 @bools_logical_commute0_and2(i1 %a, i1 %b, i1 %c) { ret i1 %or } -define i1 @bools_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) { +define i1 @bools_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools_logical_commute0_and1_and2( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[B:%.*]], i1 [[A:%.*]], !prof [[PROF3:![0-9]+]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %not, %a %and2 = and i1 %c, %b - %or = select i1 %and1, i1 true, i1 %and2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !1 ret i1 %or } @@ -457,27 +457,27 @@ define i1 @bools_logical_commute3_and1_and2(i1 %b, i1 %c) { ret i1 %or } -define i1 
@bools2_logical_commute0(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF1]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 - %and1 = select i1 %c, i1 %a, i1 false - %and2 = select i1 %not, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and1 = select i1 %c, i1 %a, i1 false, !prof !1 + %and2 = select i1 %not, i1 %b, i1 false, !prof !2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !3 ret i1 %or } -define i1 @bools2_logical_commute0_and1(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0_and1(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0_and1( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF2]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %c, %a - %and2 = select i1 %not, i1 %b, i1 false - %or = select i1 %and1, i1 true, i1 %and2 + %and2 = select i1 %not, i1 %b, i1 false, !prof !1 + %or = select i1 %and1, i1 true, i1 %and2, !prof !2 ret i1 %or } @@ -493,15 +493,15 @@ define i1 @bools2_logical_commute0_and2(i1 %a, i1 %b, i1 %c) { ret i1 %or } -define i1 @bools2_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) { +define i1 @bools2_logical_commute0_and1_and2(i1 %a, i1 %b, i1 %c) !prof !0 { ; CHECK-LABEL: @bools2_logical_commute0_and1_and2( -; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]] +; CHECK-NEXT: [[OR:%.*]] = select i1 [[C:%.*]], i1 [[A:%.*]], i1 [[B:%.*]], !prof [[PROF3]] ; CHECK-NEXT: ret i1 [[OR]] ; %not = xor i1 %c, -1 %and1 = and i1 %c, %a %and2 = and i1 %not, %b - %or = select i1 %and1, i1 true, i1 %and2 + %or = select i1 %and1, i1 true, i1 %and2, !prof !1 ret i1 %or } @@ -799,8 +799,11 @@ define <2 x i1> @not_logical_and2(i1 %b, <2 x i32> %a) { !0 = !{!"function_entry_count", i64 1000} !1 = !{!"branch_weights", i32 2, i32 3} +!2 = !{!"branch_weights", i32 5, i32 7} +!3 = !{!"branch_weights", i32 11, i32 13} ;. ; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i64 1000} ; CHECK: [[PROF1]] = !{!"branch_weights", i32 2, i32 3} ; CHECK: [[PROF2]] = !{!"branch_weights", i32 3, i32 2} +; CHECK: [[PROF3]] = !{!"unknown", !"instcombine"} ;. 
diff --git a/llvm/test/Transforms/InstCombine/sub-gep.ll b/llvm/test/Transforms/InstCombine/sub-gep.ll index ee70137e8fbd7..01da63fa5b0af 100644 --- a/llvm/test/Transforms/InstCombine/sub-gep.ll +++ b/llvm/test/Transforms/InstCombine/sub-gep.ll @@ -858,8 +858,7 @@ define i1 @_gep_phi2(ptr %str1, i64 %val2) { ; CHECK: while.end.i: ; CHECK-NEXT: br label [[_Z3FOOPKC_EXIT]] ; CHECK: _Z3fooPKc.exit: -; CHECK-NEXT: [[RETVAL_0_I:%.*]] = phi i64 [ 1, [[WHILE_END_I]] ], [ 0, [[LOR_LHS_FALSE_I]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[RETVAL_0_I]], [[VAL2:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 1, [[WHILE_END_I]] ], [ [[VAL2:%.*]], [[LOR_LHS_FALSE_I]] ], [ [[VAL2]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[TOBOOL]] ; diff --git a/llvm/test/Transforms/LoopFusion/pr164082.ll b/llvm/test/Transforms/LoopFusion/pr164082.ll new file mode 100644 index 0000000000000..652557cef48f8 --- /dev/null +++ b/llvm/test/Transforms/LoopFusion/pr164082.ll @@ -0,0 +1,65 @@ +; REQUIRES: asserts +; RUN: opt -passes=loop-fusion -disable-output -stats < %s 2>&1 | FileCheck -check-prefix=STAT %s +; STAT: 1 loop-fusion - Loops fused + +; C Code +; +;; for (int i = 0; i < 100; ++i) +;; Array[i][i] = -i; +;; for (int row = 0; row < 100; ++row) +;; for (int col = 0; col < 100; ++col) +;; if (col != row) +;; Array[row][col] = row + col; +; +; Loop fusion should no longer crash, now that forgetBlockAndLoopDispositions() +; is triggered after mergeLatch() during the fusion. + +define i32 @forget_dispositions() nounwind { +entry: + %Array = alloca [100 x [100 x i32]], align 4 + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv33 = phi i64 [ 0, %entry ], [ %indvars.iv.next34, %for.body ] + %0 = trunc i64 %indvars.iv33 to i32 + %sub = sub i32 0, %0 + %arrayidx2 = getelementptr inbounds [100 x [100 x i32]], ptr %Array, i64 0, i64 %indvars.iv33, i64 %indvars.iv33 + store i32 %sub, ptr %arrayidx2, align 4 + %indvars.iv.next34 = add i64 %indvars.iv33, 1 + %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32 + %exitcond36 = icmp eq i32 %lftr.wideiv35, 100 + br i1 %exitcond36, label %for.cond6.preheader, label %for.body + +for.cond6.preheader: ; preds = %for.body, %for.inc17 + %indvars.iv29 = phi i64 [ %indvars.iv.next30, %for.inc17 ], [ 0, %for.body ] + br label %for.body8 + +for.body8: ; preds = %for.inc14, %for.cond6.preheader + %indvars.iv = phi i64 [ 0, %for.cond6.preheader ], [ %indvars.iv.next, %for.inc14 ] + %1 = trunc i64 %indvars.iv to i32 + %2 = trunc i64 %indvars.iv29 to i32 + %cmp9 = icmp eq i32 %1, %2 + br i1 %cmp9, label %for.inc14, label %if.then + +if.then: ; preds = %for.body8 + %3 = add i64 %indvars.iv, %indvars.iv29 + %arrayidx13 = getelementptr inbounds [100 x [100 x i32]], ptr %Array, i64 0, i64 %indvars.iv29, i64 %indvars.iv + %4 = trunc i64 %3 to i32 + store i32 %4, ptr %arrayidx13, align 4 + br label %for.inc14 + +for.inc14: ; preds = %for.body8, %if.then + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv27 = trunc i64 %indvars.iv.next to i32 + %exitcond28 = icmp eq i32 %lftr.wideiv27, 100 + br i1 %exitcond28, label %for.inc17, label %for.body8 + +for.inc17: ; preds = %for.inc14 + %indvars.iv.next30 = add i64 %indvars.iv29, 1 + %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32 + %exitcond32 = icmp eq i32 %lftr.wideiv31, 100 + br i1 %exitcond32, label %for.exit, label %for.cond6.preheader + +for.exit: ; preds = %for.inc17 + ret i32 0 +} diff --git a/llvm/test/Transforms/LoopIdiom/basic.ll 
b/llvm/test/Transforms/LoopIdiom/basic.ll index e8ea912246728..9deccf5352ea8 100644 --- a/llvm/test/Transforms/LoopIdiom/basic.ll +++ b/llvm/test/Transforms/LoopIdiom/basic.ll @@ -1620,5 +1620,5 @@ define noalias ptr @_ZN8CMSPULog9beginImplEja(ptr nocapture writeonly %0) local_ ; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) } ; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) } -; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ;. diff --git a/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll b/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll index 7f266a754d1bc..314cf38baae04 100644 --- a/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll +++ b/llvm/test/Transforms/LoopUnroll/full-unroll-avoid-partial.ll @@ -85,6 +85,35 @@ for.body: ; preds = %for.body.preheader, br i1 %exitcond, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !3 } +; LOOP-UNROLL-LABEL: Loop Unroll: F[pragma_unroll_count2] Loop %for.body +; LOOP-UNROLL-NEXT: Loop Size = 4 +; LOOP-UNROLL-NEXT: Exiting block %for.body: TripCount=0, TripMultiple=1, BreakoutTrip=1 +; LOOP-UNROLL-NEXT: Trying runtime unrolling on Loop: +; LOOP-UNROLL-NEXT: Loop at depth 1 containing: %for.body<header><exiting>,%for.cond<latch> +; LOOP-UNROLL-NEXT: Using epilog remainder. +; LOOP-UNROLL-NEXT: Loop latch not terminated by a conditional branch. +; LOOP-UNROLL-NEXT: UNROLLING loop %for.body by 5! + +; LOOP-UNROLL-FULL-LABEL: Loop Unroll: F[pragma_unroll_count2] Loop %for.body +; LOOP-UNROLL-FULL-NEXT: Loop Size = 4 +; LOOP-UNROLL-FULL-NEXT: Not attempting partial/runtime unroll in FullLoopUnroll +define void @pragma_unroll_count2(i64 %n) { +entry: + br label %for.body + +for.body: ; preds = %for.cond, %entry + %i = phi i64 [ 0, %entry ], [ %inc, %for.cond ] + %cmp = icmp ult i64 %i, %n + br i1 %cmp, label %for.cond, label %for.cond.cleanup + +for.cond: ; preds = %for.body + %inc = add i64 %i, 8 + br label %for.body, !llvm.loop !3 + +for.cond.cleanup: ; preds = %for.body + ret void +} + ; LOOP-UNROLL: llvm.loop.unroll.disable ; LOOP-UNROLL-FULL: llvm.loop.unroll.enable !0 = !{!"llvm.loop.unroll.enable"} diff --git a/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll b/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll new file mode 100644 index 0000000000000..14f6da42df6b1 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/loop-probability-one.ll @@ -0,0 +1,116 @@ +; Check that a loop probability of one (indicating an always infinite loop) does +; not crash or otherwise break LoopUnroll behavior when it tries to compute new +; probabilities from it. +; +; That case indicates an always infinite loop. A remainder loop cannot be +; calculated at run time when the original loop is infinite as infinity % +; UnrollCount is undefined, so consistent remainder loop probabilities are +; difficult or impossible to reason about. The implementation chooses +; probabilities indicating that all remainder loop iterations will always +; execute. 
+ +; DEFINE: %{unroll} = opt < %s -unroll-count=3 -passes=loop-unroll -S +; DEFINE: %{rt} = %{unroll} -unroll-runtime + +; RUN: %{unroll} | FileCheck %s -check-prefix UNROLL +; RUN: %{rt} -unroll-runtime-epilog=true | FileCheck %s -check-prefix EPILOG +; RUN: %{rt} -unroll-runtime-epilog=false | FileCheck %s -check-prefix PROLOG + +define void @test(i32 %n) { +entry: + br label %loop + +loop: + %i = phi i32 [ 0, %entry ], [ %inc, %loop ] + %inc = add i32 %i, 1 + %c = icmp slt i32 %inc, %n + br i1 %c, label %loop, label %end, !prof !0 + +end: + ret void +} + + +!0 = !{!"branch_weights", i32 1, i32 0} + +; UNROLL: define void @test(i32 %n) { +; UNROLL: entry: +; UNROLL: br label %loop +; UNROLL: loop: +; UNROLL: br i1 %c, label %loop.1, label %end, !prof !0 +; UNROLL: loop.1: +; UNROLL: br i1 %c.1, label %loop.2, label %end, !prof !0 +; UNROLL: loop.2: +; UNROLL: br i1 %c.2, label %loop, label %end, !prof !0, !llvm.loop !1 +; UNROLL-NOT: loop.3 +; UNROLL: end: +; UNROLL: ret void +; UNROLL: } +; +; Infinite unrolled loop. +; UNROLL: !0 = !{!"branch_weights", i32 1, i32 0} + +; EPILOG: define void @test(i32 %n) { +; EPILOG: entry: +; EPILOG: br i1 %{{.*}}, label %loop.epil.preheader, label %entry.new, !prof !0 +; EPILOG: entry.new: +; EPILOG: br label %loop +; EPILOG: loop: +; EPILOG: br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !1 +; EPILOG: end.unr-lcssa: +; EPILOG: br i1 %{{.*}}, label %loop.epil.preheader, label %end, !prof !1 +; EPILOG: loop.epil.preheader: +; EPILOG: br label %loop.epil +; EPILOG: loop.epil: +; EPILOG: br i1 %{{.*}}, label %loop.epil, label %end.epilog-lcssa, !prof !4 +; EPILOG: end.epilog-lcssa: +; EPILOG: br label %end +; EPILOG: end: +; EPILOG: ret void +; EPILOG: } +; +; Unrolled loop guard: Unrolled loop is always entered. +; EPILOG: !0 = !{!"branch_weights", i32 0, i32 -2147483648} +; +; Unrolled loop latch: Unrolled loop is infinite. +; Epilogue loop guard: Epilogue loop is always entered if unrolled loop exits. +; EPILOG: !1 = !{!"branch_weights", i32 -2147483648, i32 0} +; +; Epilogue loop latch: Epilogue loop executes both of its 2 iterations. +; EPILOG: !4 = !{!"branch_weights", i32 1073741824, i32 1073741824} + +; PROLOG: define void @test(i32 %n) { +; PROLOG: entry: +; PROLOG: br i1 %{{.*}}, label %loop.prol.preheader, label %loop.prol.loopexit, !prof !0 +; PROLOG: loop.prol.preheader: +; PROLOG: br label %loop.prol +; PROLOG: loop.prol: +; PROLOG: br i1 %{{.*}}, label %loop.prol, label %loop.prol.loopexit.unr-lcssa, !prof !1 +; PROLOG: loop.prol.loopexit.unr-lcssa: +; PROLOG: br label %loop.prol.loopexit +; PROLOG: loop.prol.loopexit: +; PROLOG: br i1 %{{.*}}, label %end, label %entry.new, !prof !0 +; PROLOG: entry.new: +; PROLOG: br label %loop +; PROLOG: loop: +; PROLOG: br i1 %{{.*}}, label %loop, label %end.unr-lcssa, !prof !4 +; PROLOG: end.unr-lcssa: +; PROLOG: br label %end +; PROLOG: end: +; PROLOG: ret void +; PROLOG: } +; +; FIXME: Branch weights still need to be fixed in the case of prologues (issue +; #135812), so !0 and !1 do not yet match their comments below. When we do +; fix it, this test will hopefully catch any bug like issue #165998, which +; impacted the case of epilogues. +; +; Prologue loop guard: Prologue loop is always entered. +; Unrolled loop guard: Unrolled loop is always entered. +; PROLOG: !0 = !{!"branch_weights", i32 1, i32 127} +; +; Prologue loop latch: Prologue loop executes both of its 2 iterations. +; PROLOG: !1 = !{!"branch_weights", i32 0, i32 1} +; +; Unrolled loop latch: Unrolled loop is infinite. 
+; PROLOG: !4 = !{!"branch_weights", i32 1, i32 0} diff --git a/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll b/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll new file mode 100644 index 0000000000000..4d378b0d22f7d --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/zeroed-branch-weights.ll @@ -0,0 +1,30 @@ +; Check that zeroed branch weights do not crash or otherwise break basic +; LoopUnroll behavior when it tries to compute a probability from them. + +; RUN: opt < %s -S -unroll-count=2 -passes='loop-unroll' 2>&1 | FileCheck %s + +define void @test() { +entry: + br label %loop + +loop: + br i1 false, label %end, label %loop, !prof !0 + +end: + ret void +} + +!0 = !{!"branch_weights", i32 0, i32 0} + +; CHECK: define void @test() { +; CHECK: entry: +; CHECK: br label %loop +; CHECK: loop: +; CHECK: br i1 false, label %end, label %loop.1, !prof !0 +; CHECK: loop.1: +; CHECK: br i1 false, label %end, label %loop, !prof !0, !llvm.loop !1 +; CHECK-NOT: loop.2 +; CHECK: end: +; CHECK: ret void +; CHECK: } +; CHECK: !0 = !{!"branch_weights", i32 0, i32 0} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll index 199203a9f5cb0..1164778c19070 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll @@ -82,7 +82,7 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 { ; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16 ; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> -; CHECK: Cost for VF 8: 27 +; CHECK: Cost for VF 8: 16 ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll index 26e630f969ef3..0ee6b52a2450b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll @@ -55,44 +55,36 @@ define i32 @sudot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: entry: ; CHECK-NOI8MM-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-NOI8MM: vector.ph: -; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 -; CHECK-NOI8MM-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NOI8MM-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: ; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ 
[[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 -; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]] -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1 -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3 -; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]] -; CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]] -; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]] -; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]] -; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP13]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP14]], [[TMP2]] +; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]] +; CHECK-NOI8MM-NEXT: [[TMP10]] = add <16 x i32> [[TMP8]], [[VEC_PHI]] +; CHECK-NOI8MM-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]] +; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NOI8MM-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-NOI8MM: middle.block: -; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]] -; 
CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]]) -; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-NOI8MM: scalar.ph: +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] +; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-NOI8MM: for.exit: +; CHECK-NOI8MM-NEXT: ret i32 [[TMP15]] ; entry: br label %for.body @@ -166,44 +158,36 @@ define i32 @usdot(ptr %a, ptr %b) #0 { ; CHECK-NOI8MM-NEXT: entry: ; CHECK-NOI8MM-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-NOI8MM: vector.ph: -; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 -; CHECK-NOI8MM-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NOI8MM-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NOI8MM-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-NOI8MM: vector.body: ; CHECK-NOI8MM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] -; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NOI8MM-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-NOI8MM-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 3 -; CHECK-NOI8MM-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP6]], i64 [[TMP9]] -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP6]], align 1 -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP11:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> +; CHECK-NOI8MM-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP6]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP2:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-NOI8MM-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NOI8MM-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 3 -; CHECK-NOI8MM-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i64 [[TMP16]] -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 -; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP21:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i32> -; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = mul <vscale x 8 x i32> [[TMP12]], [[TMP11]] -; 
CHECK-NOI8MM-NEXT: [[TMP22:%.*]] = mul <vscale x 8 x i32> [[TMP21]], [[TMP7]] -; CHECK-NOI8MM-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[TMP14]], [[VEC_PHI]] -; CHECK-NOI8MM-NEXT: [[TMP20]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]] -; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-NOI8MM-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NOI8MM-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NOI8MM-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP13]], i32 16 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1 +; CHECK-NOI8MM-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-NOI8MM-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP14]], [[TMP2]] +; CHECK-NOI8MM-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]] +; CHECK-NOI8MM-NEXT: [[TMP10]] = add <16 x i32> [[TMP8]], [[VEC_PHI]] +; CHECK-NOI8MM-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]] +; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NOI8MM-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NOI8MM-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-NOI8MM: middle.block: -; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP18]] -; CHECK-NOI8MM-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]]) -; CHECK-NOI8MM-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NOI8MM-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-NOI8MM: scalar.ph: +; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] +; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-NOI8MM-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-NOI8MM: for.exit: +; CHECK-NOI8MM-NEXT: ret i32 [[TMP15]] ; entry: br label %for.body @@ -292,7 +276,7 @@ define i32 @sudot_neon(ptr %a, ptr %b) #1 { ; CHECK-NOI8MM-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI1]] ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-NOI8MM: middle.block: ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP13]], [[TMP12]] ; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) @@ -387,7 +371,7 @@ define i32 @usdot_neon(ptr %a, ptr %b) #1 { ; CHECK-NOI8MM-NEXT: [[TMP13]] = add <16 x i32> [[TMP11]], [[VEC_PHI1]] ; CHECK-NOI8MM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NOI8MM-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NOI8MM-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK-NOI8MM: middle.block: ; CHECK-NOI8MM-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP13]], [[TMP12]] ; CHECK-NOI8MM-NEXT: [[TMP15:%.*]] = call i32 
@llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index b84763142b686..e74830700776c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -512,17 +512,23 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) { ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP7]] = mul <16 x i32> [[TMP6]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP7]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP7]], [[TMP8]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul <16 x i32> [[TMP6]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul <16 x i32> [[TMP12]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP13]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <16 x i32> [[TMP13]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 4636c1b63da82..d77ca9875bf01 100644 --- 
a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -758,132 +758,87 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP13]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP16]] = mul <vscale x 8 x i32> [[TMP15]], [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP16]], i32 -1) -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP17]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP13]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = mul <16 x i32> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP4]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1 -; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP16]], i32 [[TMP22]] -; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = mul nuw i32 [[TMP24]], 8 -; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = sub i32 [[TMP25]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = extractelement <vscale x 8 x i32> [[TMP18]], i32 [[TMP26]] -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP8]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP12]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP14]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP10]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 3 -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP17]], i64 [[TMP20]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 
8 x i8>, ptr [[TMP17]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP21]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = mul <vscale x 8 x i32> [[TMP22]], [[TMP15]] -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[TMP24]], <vscale x 8 x i32> [[TMP25]], i32 -1) -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP17]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP8]], <16 x i32> [[TMP9]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add <16 x i32> [[TMP9]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP29]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 -; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP31]] -; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nuw i32 [[TMP33]], 8 -; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP35]] -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = extractelement <16 x i32> [[TMP11]], i32 15 +; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP13]] ; ; CHECK-MAXBW-LABEL: define i32 @not_dotp_not_loop_carried( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-MAXBW-NEXT: entry: ; CHECK-MAXBW-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 
@llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = mul nuw i32 [[TMP6]], 8 -; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1 -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 0, i32 [[TMP8]] ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 8 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR:%.*]] = phi <16 x i32> [ <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x i8>, ptr [[TMP10]], align 1 -; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD1]] to <vscale x 8 x i32> +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x i8>, ptr [[TMP17]], align 1 -; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD3]] to <vscale x 8 x i32> -; CHECK-MAXBW-NEXT: [[TMP25]] = mul <vscale x 8 x i32> [[TMP23]], [[TMP16]] -; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> [[VECTOR_RECUR]], <vscale x 8 x i32> [[TMP25]], i32 -1) -; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = add <vscale x 8 x i32> [[TMP25]], [[TMP26]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-MAXBW-NEXT: [[TMP4]] = mul <16 x i32> [[TMP3]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[VECTOR_RECUR]], <16 x i32> [[TMP4]], <16 x i32> <i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30> +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add <16 x i32> [[TMP4]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8 -; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = sub i32 [[TMP21]], 1 -; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 8 x i32> [[TMP25]], i32 [[TMP22]] -; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = call 
i32 @llvm.vscale.i32() -; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = mul nuw i32 [[TMP24]], 8 -; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 1 -; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = extractelement <vscale x 8 x i32> [[TMP27]], i32 [[TMP31]] -; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-MAXBW: scalar.ph: +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP6]], i32 15 +; CHECK-MAXBW-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-MAXBW: for.exit: +; CHECK-MAXBW-NEXT: ret i32 [[TMP8]] ; entry: br label %for.body @@ -930,7 +885,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = add <vscale x 8 x i32> [[TMP16]], [[TMP15]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8 @@ -968,7 +923,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP30]], [[TMP22]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 8 @@ -1000,7 +955,7 @@ define i32 @not_dotp_not_phi(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = add <vscale x 8 x i32> [[TMP20]], [[TMP19]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = mul nuw i32 [[TMP23]], 8 @@ -1085,7 +1040,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP22]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> 
[[PARTIAL_REDUCE13]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) @@ -1183,7 +1138,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE29]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP42]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE29]], [[PARTIAL_REDUCE28]] ; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) @@ -1253,7 +1208,7 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE13]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP26]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE13]]) ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE10]]) @@ -1350,7 +1305,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) ; CHECK-INTERLEAVE1-NEXT: br label [[EXIT:%.*]] @@ -1388,7 +1343,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 
@llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) ; CHECK-INTERLEAVED-NEXT: br label [[EXIT:%.*]] @@ -1426,7 +1381,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) ; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = xor i1 [[TMP19]], true -; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: br label [[EXIT:%.*]] @@ -1461,82 +1416,66 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP8]], 4 -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP6]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul <vscale x 4 x i32> [[TMP12]], [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[TMP13]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP9]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <16 x i32> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 
[[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 4 -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement <vscale x 4 x i32> [[TMP12]], i32 [[TMP19]] -; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVE1: scalar.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 +; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVE1: for.exit: +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add i32 [[TMP7]], [[TMP8]] +; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]] ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = mul nuw i64 [[TMP6]], 8 -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP7]] -; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP15]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD2]] to <vscale x 4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP3]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: 
[[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 2 -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP17]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i8>, ptr [[TMP8]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i8>, ptr [[TMP18]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD3]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD4]] to <vscale x 4 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul <vscale x 4 x i32> [[TMP19]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul <vscale x 4 x i32> [[TMP20]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add <vscale x 4 x i32> [[TMP21]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP24]] = add <vscale x 4 x i32> [[TMP22]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP8]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP10]] = add <16 x i32> [[TMP15]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP24]], [[TMP23]] -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = mul nuw i32 [[TMP27]], 4 -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sub i32 [[TMP28]], 1 -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = extractelement <vscale x 4 x i32> [[TMP20]], i32 [[TMP29]] -; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] -; CHECK-INTERLEAVED: scalar.ph: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15 +; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] +; CHECK-INTERLEAVED: for.exit: +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add i32 [[TMP13]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]] ; ; CHECK-MAXBW-LABEL: 
define i32 @not_dotp_extend_user( ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { @@ -1561,7 +1500,7 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[TMP24]] = add <vscale x 8 x i32> [[TMP22]], [[VEC_PHI1]] ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP24]]) ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() @@ -1616,7 +1555,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]] @@ -1651,7 +1590,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP10]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) @@ -1685,7 +1624,7 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP9]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]] @@ -1803,7 +1742,7 @@ define void @not_dotp_not_phi2(ptr %matrix, i32 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP23]] = add i32 [[TMP21]], [[TMP15]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP22]] ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -1915,7 +1854,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -1953,7 +1892,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -1968,31 +1907,27 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] ; CHECK-MAXBW: for.ph: ; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0 -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; 
CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64> -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[TMP9]], [[BROADCAST_SPLAT]] -; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[TMP1]], [[BROADCAST_SPLAT]] +; CHECK-MAXBW-NEXT: [[TMP3]] = add <8 x i64> [[TMP2]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -2048,7 +1983,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[TMP4]], [[VEC_PHI]] ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] @@ -2086,7 +2021,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9]] = add <8 x i64> [[TMP7]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP9]], [[TMP8]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i64 
@llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) @@ -2101,31 +2036,27 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_PH:%.*]] ; CHECK-MAXBW: for.ph: ; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = zext i16 [[B]] to i64 -; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 ; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-MAXBW: vector.ph: -; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 4 -; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 ; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[EXT_B]], i64 0 -; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-MAXBW: vector.body: ; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]] -; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, ptr [[TMP7]], align 2 -; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = zext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i64> -; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = mul nuw nsw <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP9]] -; CHECK-MAXBW-NEXT: [[TMP11]] = add <vscale x 4 x i64> [[TMP10]], [[VEC_PHI]] -; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP7]], align 2 +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul nuw nsw <8 x i64> [[BROADCAST_SPLAT]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP3]] = add <8 x i64> [[TMP2]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-MAXBW: middle.block: -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP11]]) +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP3]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], 
label [[SCALAR_PH]] ; CHECK-MAXBW: scalar.ph: @@ -2186,7 +2117,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2232,7 +2163,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP12]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]] ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) @@ -2274,7 +2205,7 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 { ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP11]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]]) ; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] @@ -2396,7 +2327,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-INTERLEAVE1-NEXT: [[TMP45:%.*]] = call i32 
@llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) @@ -2496,7 +2427,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) @@ -2596,7 +2527,7 @@ define dso_local void @not_dotp_high_register_pressure(ptr %a, ptr %b, ptr %sum, ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE21]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP34]]) ; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-MAXBW-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK-MAXBW: middle.block: ; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE21]]) ; CHECK-MAXBW-NEXT: [[TMP37:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE20]]) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll index 89819f2be4967..1cbec47d72203 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/masked_gather_scatter.ll @@ -17,18 +17,33 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-LABEL: @foo4( ; RV32-NEXT: entry: ; RV32-NEXT: br label [[VECTOR_MEMCHECK:%.*]] +; RV32: vector.scevcheck: +; RV32-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 128, i32 624) +; RV32-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0 +; RV32-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1 +; RV32-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 [[MUL_RESULT]] +; RV32-NEXT: [[TMP1:%.*]] = icmp ult ptr [[TMP0]], [[A]] +; RV32-NEXT: [[TMP2:%.*]] = or i1 [[TMP1]], [[MUL_OVERFLOW]] +; RV32-NEXT: [[MUL1:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 256, i32 624) +; RV32-NEXT: [[MUL_RESULT2:%.*]] = extractvalue { i32, i1 } [[MUL1]], 0 +; RV32-NEXT: [[MUL_OVERFLOW3:%.*]] = extractvalue { i32, i1 } [[MUL1]], 1 +; RV32-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 [[MUL_RESULT2]] +; RV32-NEXT: [[TMP4:%.*]] = icmp ult ptr [[TMP3]], [[B]] +; RV32-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW3]] +; RV32-NEXT: [[TMP6:%.*]] = or i1 [[TMP2]], [[TMP5]] +; RV32-NEXT: br i1 [[TMP6]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK1:%.*]] ; RV32: vector.memcheck: -; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 79880 ; RV32-NEXT: [[SCEVGEP1:%.*]] = 
getelementptr i8, ptr [[TRIGGER:%.*]], i32 39940 -; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B:%.*]], i32 159752 -; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] -; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] +; RV32-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i32 79880 +; RV32-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[B]], i32 159752 +; RV32-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[TRIGGER]], [[SCEVGEP]] +; RV32-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP1]] ; RV32-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; RV32-NEXT: [[BOUND03:%.*]] = icmp ult ptr [[A]], [[SCEVGEP2]] ; RV32-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[B]], [[SCEVGEP]] ; RV32-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] ; RV32-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] -; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; RV32-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; RV32: vector.ph: ; RV32-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64() ; RV32-NEXT: [[TMP9:%.*]] = mul <vscale x 2 x i64> [[TMP7]], splat (i64 16) @@ -43,25 +58,26 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP11]], i64 0 ; RV32-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer ; RV32-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], <vscale x 2 x i64> [[VEC_IND]] -; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]] +; RV32-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i32> @llvm.vp.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> align 4 [[TMP13]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP10]]), !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] ; RV32-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 2 x i32> [[WIDE_MASKED_GATHER]], splat (i32 100) ; RV32-NEXT: [[TMP15:%.*]] = shl nuw nsw <vscale x 2 x i64> [[VEC_IND]], splat (i64 1) ; RV32-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[B]], <vscale x 2 x i64> [[TMP15]] -; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META3:![0-9]+]] +; RV32-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 2 x double> @llvm.vp.gather.nxv2f64.nxv2p0(<vscale x 2 x ptr> align 8 [[TMP16]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]] ; RV32-NEXT: [[TMP17:%.*]] = sitofp <vscale x 2 x i32> [[WIDE_MASKED_GATHER]] to <vscale x 2 x double> ; RV32-NEXT: [[TMP18:%.*]] = fadd <vscale x 2 x double> [[WIDE_MASKED_GATHER6]], [[TMP17]] ; RV32-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[A]], <vscale x 2 x i64> [[VEC_IND]] -; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> [[TMP14]], i32 [[TMP10]]), !alias.scope [[META5:![0-9]+]], !noalias [[META7:![0-9]+]] +; RV32-NEXT: call void @llvm.vp.scatter.nxv2f64.nxv2p0(<vscale x 2 x double> [[TMP18]], <vscale x 2 x ptr> align 8 [[TMP19]], <vscale x 2 x i1> 
[[TMP14]], i32 [[TMP10]]), !alias.scope [[META3]], !noalias [[META5]] ; RV32-NEXT: [[AVL_NEXT]] = sub nuw i64 [[AVL]], [[TMP8]] ; RV32-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]] ; RV32-NEXT: [[TMP24:%.*]] = icmp eq i64 [[AVL_NEXT]], 0 -; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; RV32-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; RV32: middle.block: ; RV32-NEXT: br label [[FOR_END:%.*]] ; RV32: scalar.ph: +; RV32-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK1]] ] ; RV32-NEXT: br label [[FOR_BODY:%.*]] ; RV32: for.body: -; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; RV32-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; RV32-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TRIGGER]], i64 [[INDVARS_IV]] ; RV32-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; RV32-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP21]], 100 @@ -78,7 +94,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV32: for.inc: ; RV32-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; RV32-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] +; RV32-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] ; RV32: for.end: ; RV32-NEXT: ret void ; @@ -146,7 +162,7 @@ define void @foo4(ptr nocapture %A, ptr nocapture readonly %B, ptr nocapture rea ; RV64: for.inc: ; RV64-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16 ; RV64-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000 -; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP12:![0-9]+]] +; RV64-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP11:![0-9]+]] ; RV64: for.end: ; RV64-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll index e6b74062ad765..a33f8eb920039 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/consecutive-ptr-cg-bug.ll @@ -35,7 +35,7 @@ target triple = "x86_64-unknown-linux-gnu" ; This test was originally vectorized, but now SCEV is smart enough to prove ; that its trip count is 1, so it gets ignored by vectorizer. 
; Function Attrs: uwtable -define void @test_01(i1 %arg) { +define void @test_01(ptr addrspace(1) %p, i1 %arg) { br label %.outer ; <label>:1: ; preds = %2 @@ -57,8 +57,8 @@ define void @test_01(i1 %arg) { %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] %9 = add i32 %8, 2 %10 = zext i32 %9 to i64 - %11 = getelementptr inbounds i32, ptr addrspace(1) undef, i64 %10 - %12 = ashr i32 undef, %4 + %11 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %10 + %12 = ashr i32 12, %4 store i32 %12, ptr addrspace(1) %11, align 4 %13 = add i32 %7, 1 %14 = icmp sgt i32 %13, 61 @@ -74,7 +74,7 @@ define void @test_01(i1 %arg) { ; CHECK: store <4 x i32> ; Function Attrs: uwtable -define void @test_02(i1 %arg) { +define void @test_02(ptr addrspace(1) %p, i1 %arg) { br label %.outer ; <label>:1: ; preds = %2 @@ -96,8 +96,8 @@ define void @test_02(i1 %arg) { %8 = phi i32 [ %.ph2, %.outer ], [ %7, %6 ] %9 = add i32 %8, 2 %10 = zext i32 %9 to i64 - %11 = getelementptr inbounds i32, ptr addrspace(1) undef, i64 %10 - %12 = ashr i32 undef, %4 + %11 = getelementptr inbounds i32, ptr addrspace(1) %p, i64 %10 + %12 = ashr i32 12, %4 store i32 %12, ptr addrspace(1) %11, align 4 %13 = add i32 %7, 1 %14 = icmp sgt i32 %13, 610 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll index 4cff8753ba9b1..239366c59470e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model-assert.ll @@ -11,9 +11,9 @@ target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-w64-windows-gnu" -define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p) #0 { +define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p, ptr %pend) #0 { ; CHECK-LABEL: define void @cff_index_load_offsets( -; CHECK-SAME: i1 [[COND:%.*]], i8 [[X:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: i1 [[COND:%.*]], i8 [[X:%.*]], ptr [[P:%.*]], ptr [[PEND:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 [[COND]], label %[[IF_THEN:.*]], label %[[EXIT:.*]] ; CHECK: [[IF_THEN]]: @@ -26,14 +26,14 @@ define void @cff_index_load_offsets(i1 %cond, i8 %x, ptr %p) #0 { ; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16 ; CHECK-NEXT: [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]] -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr undef, align 1, !tbaa [[CHAR_TBAA1]] -; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 undef, 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[P]], align 1, !tbaa [[CHAR_TBAA1]] +; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 12, 8 ; CHECK-NEXT: [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]] ; CHECK-NEXT: [[CONV81:%.*]] = zext i8 [[TMP1]] to i32 ; CHECK-NEXT: [[OR83:%.*]] = or i32 [[OR79]], [[CONV81]] -; CHECK-NEXT: store i32 [[OR83]], ptr undef, align 4, !tbaa [[LONG_TBAA4:![0-9]+]] +; CHECK-NEXT: store i32 [[OR83]], ptr [[P]], align 4, !tbaa [[LONG_TBAA4:![0-9]+]] ; CHECK-NEXT: [[ADD_PTR86]] = getelementptr inbounds i8, ptr [[P_359]], i64 4 -; CHECK-NEXT: [[CMP66:%.*]] = icmp ult ptr [[ADD_PTR86]], undef +; CHECK-NEXT: [[CMP66:%.*]] = icmp ult ptr [[ADD_PTR86]], [[PEND]] ; CHECK-NEXT: br i1 [[CMP66]], label %[[FOR_BODY68]], label %[[SW_EPILOG:.*]] ; CHECK: [[SW_EPILOG]]: ; CHECK-NEXT: unreachable @@ -54,14 +54,14 @@ for.body68: ; preds = %for.body68, %if.the %conv73 = zext i8 %0 to i32 %shl74 = shl nuw nsw i32 %conv73, 16 %or75 = or i32 %shl74, %shl71 - %1 = load i8, 
ptr undef, align 1, !tbaa !1 - %shl78 = shl nuw nsw i32 undef, 8 + %1 = load i8, ptr %p, align 1, !tbaa !1 + %shl78 = shl nuw nsw i32 12, 8 %or79 = or i32 %or75, %shl78 %conv81 = zext i8 %1 to i32 %or83 = or i32 %or79, %conv81 - store i32 %or83, ptr undef, align 4, !tbaa !4 + store i32 %or83, ptr %p, align 4, !tbaa !4 %add.ptr86 = getelementptr inbounds i8, ptr %p.359, i64 4 - %cmp66 = icmp ult ptr %add.ptr86, undef + %cmp66 = icmp ult ptr %add.ptr86, %pend br i1 %cmp66, label %for.body68, label %sw.epilog sw.epilog: ; preds = %for.body68 diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll index e75d469506376..acec9e47a94ee 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaved-accesses-large-gap.ll @@ -41,8 +41,8 @@ for.cond.cleanup: ; preds = %for.body ; Make sure interleave groups with a key being the special 'empty' value for ; the map do not cause a crash. -define void @test_gap_empty_key() { -; CHECK-LABEL: @test_gap_empty_key() +define void @test_gap_empty_key(ptr %p) { +; CHECK-LABEL: @test_gap_empty_key(ptr %p) ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %for.body @@ -57,7 +57,7 @@ entry: for.body: %iv = phi i64 [ 1, %entry ], [ %iv.next, %for.body ] %iv.next = add nsw i64 %iv, 1 - %arrayidx = getelementptr inbounds [3 x i32], ptr undef, i64 0, i64 %iv.next + %arrayidx = getelementptr inbounds [3 x i32], ptr %p, i64 0, i64 %iv.next %G2 = getelementptr i32, ptr %arrayidx, i64 %iv.next %G9 = getelementptr i32, ptr %G2, i32 -2147483647 store i32 0, ptr %G2 @@ -71,8 +71,8 @@ exit: ; Make sure interleave groups with a key being the special 'tombstone' value for ; the map do not cause a crash. 
-define void @test_tombstone_key() { -; CHECK-LABEL: @test_tombstone_key() +define void @test_tombstone_key(ptr %p) { +; CHECK-LABEL: @test_tombstone_key(ptr %p) ; CHECK-NEXT: entry: ; CHECK-NEXT: br label %for.body @@ -87,7 +87,7 @@ entry: for.body: %iv = phi i64 [ 1, %entry ], [ %iv.next, %for.body ] %iv.next = add nsw i64 %iv, 1 - %arrayidx = getelementptr inbounds [3 x i32], ptr undef, i64 0, i64 %iv.next + %arrayidx = getelementptr inbounds [3 x i32], ptr %p, i64 0, i64 %iv.next %G2 = getelementptr i32, ptr %arrayidx, i64 %iv.next %G9 = getelementptr i32, ptr %G2, i32 -2147483648 store i32 0, ptr %G2 diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll index efcc0005acaa3..f9570405ecabc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -711,17 +711,92 @@ define dso_local void @masked_strided3_optsize_unknown_tc(ptr noalias nocapture ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: -; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE16:%.*]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE16]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ugt <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT2]] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP4:%.*]] = select <8 x i1> [[TMP0]], <8 x i1> [[TMP1]], <8 x i1> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = mul i32 [[INDEX]], 3 -; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[TMP2]] -; ENABLED_MASKED_STRIDED-NEXT: [[INTERLEAVED_MASK:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> poison, <24 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 5, i32 5, i32 5, i32 6, i32 6, i32 6, i32 7, i32 7, i32 7> -; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = and <24 x i1> [[INTERLEAVED_MASK]], <i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false> -; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_VEC:%.*]] = call <24 x i8> @llvm.masked.load.v24i8.p0(ptr align 1 [[TMP3]], <24 x i1> [[TMP5]], <24 x i8> poison) -; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i8> [[WIDE_MASKED_VEC]], <24 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> +; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = mul nsw <8 x i32> [[VEC_IND]], splat (i32 3) +; ENABLED_MASKED_STRIDED-NEXT: [[TMP51:%.*]] = 
extractelement <8 x i1> [[TMP4]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP51]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP3]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP52:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i32 [[TMP5]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP53:%.*]] = load i8, ptr [[TMP52]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP8:%.*]] = insertelement <8 x i8> poison, i8 [[TMP53]], i64 0 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE]] +; ENABLED_MASKED_STRIDED: pred.load.continue: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP9:%.*]] = phi <8 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP10:%.*]] = extractelement <8 x i1> [[TMP4]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if3: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP3]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP11]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[TMP13]], i64 1 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; ENABLED_MASKED_STRIDED: pred.load.continue4: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP15:%.*]] = phi <8 x i8> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP16:%.*]] = extractelement <8 x i1> [[TMP4]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if5: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP3]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP17]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP19:%.*]] = load i8, ptr [[TMP18]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP20:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[TMP19]], i64 2 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; ENABLED_MASKED_STRIDED: pred.load.continue6: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP21:%.*]] = phi <8 x i8> [ [[TMP15]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP20]], [[PRED_LOAD_IF5]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP22:%.*]] = extractelement <8 x i1> [[TMP4]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if7: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP3]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP23]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP25:%.*]] = load i8, ptr [[TMP24]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP21]], i8 [[TMP25]], i64 3 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE8]] +; ENABLED_MASKED_STRIDED: pred.load.continue8: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP27:%.*]] = phi <8 x i8> [ [[TMP21]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP26]], [[PRED_LOAD_IF7]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP28:%.*]] = extractelement <8 x i1> [[TMP4]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF9:%.*]], label 
[[PRED_LOAD_CONTINUE10:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if9: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP3]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP29]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP31:%.*]] = load i8, ptr [[TMP30]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[TMP31]], i64 4 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE10]] +; ENABLED_MASKED_STRIDED: pred.load.continue10: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP33:%.*]] = phi <8 x i8> [ [[TMP27]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP32]], [[PRED_LOAD_IF9]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP34:%.*]] = extractelement <8 x i1> [[TMP4]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP34]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if11: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP3]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP35]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP38:%.*]] = insertelement <8 x i8> [[TMP33]], i8 [[TMP37]], i64 5 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE12]] +; ENABLED_MASKED_STRIDED: pred.load.continue12: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP39:%.*]] = phi <8 x i8> [ [[TMP33]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP38]], [[PRED_LOAD_IF11]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP40:%.*]] = extractelement <8 x i1> [[TMP4]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP40]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14:%.*]] +; ENABLED_MASKED_STRIDED: pred.load.if13: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP3]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP41]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP44:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[TMP43]], i64 6 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE14]] +; ENABLED_MASKED_STRIDED: pred.load.continue14: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP45:%.*]] = phi <8 x i8> [ [[TMP39]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP44]], [[PRED_LOAD_IF13]] ] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP46:%.*]] = extractelement <8 x i1> [[TMP4]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP46]], label [[PRED_LOAD_IF15:%.*]], label [[PRED_LOAD_CONTINUE16]] +; ENABLED_MASKED_STRIDED: pred.load.if15: +; ENABLED_MASKED_STRIDED-NEXT: [[TMP47:%.*]] = extractelement <8 x i32> [[TMP3]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr [[P]], i32 [[TMP47]] +; ENABLED_MASKED_STRIDED-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 +; ENABLED_MASKED_STRIDED-NEXT: [[TMP50:%.*]] = insertelement <8 x i8> [[TMP45]], i8 [[TMP49]], i64 7 +; ENABLED_MASKED_STRIDED-NEXT: br label [[PRED_LOAD_CONTINUE16]] +; ENABLED_MASKED_STRIDED: pred.load.continue16: +; ENABLED_MASKED_STRIDED-NEXT: [[STRIDED_VEC:%.*]] = phi <8 x i8> [ [[TMP45]], [[PRED_LOAD_CONTINUE14]] ], [ [[TMP50]], [[PRED_LOAD_IF15]] ] ; ENABLED_MASKED_STRIDED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[Q:%.*]], i32 [[INDEX]] ; ENABLED_MASKED_STRIDED-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[STRIDED_VEC]], ptr align 1 [[TMP6]], <8 x i1> [[TMP4]]) ; ENABLED_MASKED_STRIDED-NEXT: 
[[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll b/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll index 41756ffb64e6c..8744e45344242 100644 --- a/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll +++ b/llvm/test/Transforms/LoopVectorize/demanded-bits-of-pointer-instruction.ll @@ -4,13 +4,13 @@ ; Only make sure we do not crash. ; CHECK: @test -define void @test(ptr %ptr, ptr %ptr_end) { +define void @test(i8 %v, ptr %ptr, ptr %ptr_end) { start: br label %loop loop: %ptr2 = phi ptr [ %ptr3, %loop ], [ %ptr, %start ] - %x = sext i8 undef to i64 + %x = sext i8 %v to i64 %ptr3 = getelementptr inbounds i8, ptr %ptr2, i64 1 %cmp = icmp ult ptr %ptr3, %ptr_end br i1 %cmp, label %loop, label %end diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll index c164c4a46bd94..e7913c583b938 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -384,15 +384,15 @@ for.inc26: ; conditional store to remain scalar. Since we can only type-shrink vector ; types, we shouldn't try to represent the expression in a smaller type. ; -define void @minimal_bit_widths(i1 %c) { +define void @minimal_bit_widths(ptr %p, i1 %c) { ; UNROLL-LABEL: @minimal_bit_widths( ; UNROLL-NEXT: entry: ; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL: vector.body: ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] ; UNROLL-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]] -; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP1]] +; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]] +; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP1]] ; UNROLL-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1 ; UNROLL-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1 ; UNROLL-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]] @@ -415,8 +415,8 @@ define void @minimal_bit_widths(i1 %c) { ; UNROLL-NOSIMPLIFY: vector.body: ; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] ; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 -; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP1]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP1]] ; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP2]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = load i8, ptr [[TMP3]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] @@ -442,16 +442,16 @@ define void @minimal_bit_widths(i1 %c) { ; VEC-NEXT: br label [[VECTOR_BODY:%.*]] ; VEC: vector.body: ; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ] -; VEC-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr undef, i64 [[INDEX]] +; VEC-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[INDEX]] ; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP1]], align 1 ; VEC-NEXT: br i1 [[C:%.*]], label 
[[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE2]] ; VEC: pred.store.if: ; VEC-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 -; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP8]] +; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]] ; VEC-NEXT: [[TMP4:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0 ; VEC-NEXT: store i8 [[TMP4]], ptr [[TMP3]], align 1 ; VEC-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 1 -; VEC-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr undef, i64 [[TMP5]] +; VEC-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP5]] ; VEC-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1 ; VEC-NEXT: store i8 [[TMP7]], ptr [[TMP6]], align 1 ; VEC-NEXT: br label [[PRED_STORE_CONTINUE2]] @@ -468,7 +468,7 @@ entry: for.body: %tmp0 = phi i64 [ %tmp6, %for.inc ], [ 0, %entry ] %tmp1 = phi i64 [ %tmp7, %for.inc ], [ 1000, %entry ] - %tmp2 = getelementptr i8, ptr undef, i64 %tmp0 + %tmp2 = getelementptr i8, ptr %p, i64 %tmp0 %tmp3 = load i8, ptr %tmp2, align 1 br i1 %c, label %if.then, label %for.inc diff --git a/llvm/test/Transforms/LoopVectorize/nsw-crash.ll b/llvm/test/Transforms/LoopVectorize/nsw-crash.ll index 106054d989776..d87d9b155db1c 100644 --- a/llvm/test/Transforms/LoopVectorize/nsw-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/nsw-crash.ll @@ -3,7 +3,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" -define void @test(i1 %arg) { +define void @test(ptr %p, i1 %arg) { entry: br i1 %arg, label %while.end, label %while.body.lr.ph @@ -11,7 +11,7 @@ while.body.lr.ph: br label %while.body while.body: - %it.sroa.0.091 = phi ptr [ undef, %while.body.lr.ph ], [ %incdec.ptr.i, %while.body ] + %it.sroa.0.091 = phi ptr [ %p, %while.body.lr.ph ], [ %incdec.ptr.i, %while.body ] %incdec.ptr.i = getelementptr inbounds i32, ptr %it.sroa.0.091, i64 1 %inc32 = add i32 undef, 1 ; <------------- Make sure we don't set NSW flags to the undef. 
%cmp.i11 = icmp eq ptr %incdec.ptr.i, undef diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll index 763072ab16f73..f9f7feb7bdfbc 100644 --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/optsize.ll @@ -248,25 +248,27 @@ for.end: ; preds = %for.body ; @cm_array = external global [2592 x i16], align 1 -define void @pr43371() optsize { +define void @pr43371(i16 %val) optsize { ; ; CHECK-LABEL: define void @pr43371( -; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1 +; CHECK-NEXT: store i16 0, ptr [[TMP7]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -277,22 +279,24 @@ define void @pr43371() optsize { ; CHECK-NEXT: unreachable ; ; PGSO-LABEL: define void @pr43371( -; PGSO-SAME: ) #[[ATTR0]] { +; PGSO-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] { ; PGSO-NEXT: [[ENTRY:.*:]] ; PGSO-NEXT: br label %[[VECTOR_PH:.*]] ; PGSO: [[VECTOR_PH]]: +; PGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; PGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; PGSO-NEXT: br label %[[VECTOR_BODY:.*]] ; PGSO: [[VECTOR_BODY]]: ; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 
[[TMP2]] +; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; PGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 +; PGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 +; PGSO-NEXT: store i16 0, ptr [[TMP7]], align 1 ; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; PGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; PGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -303,22 +307,24 @@ define void @pr43371() optsize { ; PGSO-NEXT: unreachable ; ; NPGSO-LABEL: define void @pr43371( -; NPGSO-SAME: ) #[[ATTR0]] { +; NPGSO-SAME: i16 [[VAL:%.*]]) #[[ATTR0]] { ; NPGSO-NEXT: [[ENTRY:.*:]] ; NPGSO-NEXT: br label %[[VECTOR_PH:.*]] ; NPGSO: [[VECTOR_PH]]: +; NPGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; NPGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; NPGSO-NEXT: br label %[[VECTOR_BODY:.*]] ; NPGSO: [[VECTOR_BODY]]: ; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; NPGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; NPGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; NPGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; NPGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; NPGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; NPGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; NPGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; NPGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; NPGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; NPGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 +; NPGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; NPGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 +; NPGSO-NEXT: store i16 0, ptr [[TMP7]], align 1 ; NPGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; NPGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; NPGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -340,7 +346,7 @@ for.cond.cleanup28: for.body29: %i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29] - %add33 = add i16 undef, %i24.0170 + %add33 = add i16 %val, %i24.0170 %idxprom34 = zext i16 %add33 to i32 %arrayidx35 = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 %idxprom34 store i16 0, ptr %arrayidx35, align 1 @@ -349,25 +355,27 @@ for.body29: br i1 %cmp26, label %for.body29, label %for.cond.cleanup28 } -define void @pr43371_pgso() !prof !14 { +define void @pr43371_pgso(i16 %val) !prof !14 { ; ; CHECK-LABEL: define void @pr43371_pgso( -; CHECK-SAME: ) !prof [[PROF14]] { +; CHECK-SAME: i16 [[VAL:%.*]]) !prof [[PROF14]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> 
zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; CHECK-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; CHECK-NEXT: store i16 0, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; CHECK-NEXT: store i16 0, ptr [[TMP5]], align 1 +; CHECK-NEXT: store i16 0, ptr [[TMP7]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -378,22 +386,24 @@ define void @pr43371_pgso() !prof !14 { ; CHECK-NEXT: unreachable ; ; PGSO-LABEL: define void @pr43371_pgso( -; PGSO-SAME: ) !prof [[PROF14]] { +; PGSO-SAME: i16 [[VAL:%.*]]) !prof [[PROF14]] { ; PGSO-NEXT: [[ENTRY:.*:]] ; PGSO-NEXT: br label %[[VECTOR_PH:.*]] ; PGSO: [[VECTOR_PH]]: +; PGSO-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[VAL]], i64 0 +; PGSO-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer ; PGSO-NEXT: br label %[[VECTOR_BODY:.*]] ; PGSO: [[VECTOR_BODY]]: ; PGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PGSO-NEXT: [[VEC_IND:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] -; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> undef, [[VEC_IND]] +; PGSO-NEXT: [[TMP0:%.*]] = add <2 x i16> [[BROADCAST_SPLAT]], [[VEC_IND]] ; PGSO-NEXT: [[TMP1:%.*]] = zext <2 x i16> [[TMP0]] to <2 x i32> -; PGSO-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 -; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 -; PGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] +; PGSO-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; PGSO-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 ; PGSO-NEXT: [[TMP5:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP4]] -; PGSO-NEXT: store i16 0, ptr [[TMP3]], align 1 +; PGSO-NEXT: [[TMP7:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP3]] ; PGSO-NEXT: store i16 0, ptr [[TMP5]], align 1 +; PGSO-NEXT: store i16 0, ptr [[TMP7]], align 1 ; PGSO-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; PGSO-NEXT: [[VEC_IND_NEXT]] = add <2 x i16> [[VEC_IND]], splat (i16 2) ; PGSO-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 756 @@ -404,17 +414,19 @@ define void @pr43371_pgso() !prof !14 { ; PGSO-NEXT: unreachable ; ; NPGSO-LABEL: define void @pr43371_pgso( -; NPGSO-SAME: ) !prof [[PROF14]] { +; NPGSO-SAME: i16 [[VAL:%.*]]) !prof 
[[PROF14]] { ; NPGSO-NEXT: [[ENTRY:.*:]] ; NPGSO-NEXT: br label %[[VECTOR_SCEVCHECK:.*]] ; NPGSO: [[VECTOR_SCEVCHECK]]: -; NPGSO-NEXT: br i1 undef, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; NPGSO-NEXT: [[TMP0:%.*]] = add i16 [[VAL]], 755 +; NPGSO-NEXT: [[TMP4:%.*]] = icmp ult i16 [[TMP0]], [[VAL]] +; NPGSO-NEXT: br i1 [[TMP4]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; NPGSO: [[VECTOR_PH]]: ; NPGSO-NEXT: br label %[[VECTOR_BODY:.*]] ; NPGSO: [[VECTOR_BODY]]: ; NPGSO-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; NPGSO-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16 -; NPGSO-NEXT: [[TMP1:%.*]] = add i16 undef, [[OFFSET_IDX]] +; NPGSO-NEXT: [[TMP1:%.*]] = add i16 [[VAL]], [[OFFSET_IDX]] ; NPGSO-NEXT: [[TMP2:%.*]] = zext i16 [[TMP1]] to i32 ; NPGSO-NEXT: [[TMP3:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[TMP2]] ; NPGSO-NEXT: store <2 x i16> zeroinitializer, ptr [[TMP3]], align 1 @@ -429,7 +441,7 @@ define void @pr43371_pgso() !prof !14 { ; NPGSO-NEXT: unreachable ; NPGSO: [[FOR_BODY29]]: ; NPGSO-NEXT: [[I24_0170:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] -; NPGSO-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]] +; NPGSO-NEXT: [[ADD33:%.*]] = add i16 [[VAL]], [[I24_0170]] ; NPGSO-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32 ; NPGSO-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[IDXPROM34]] ; NPGSO-NEXT: store i16 0, ptr [[ARRAYIDX35]], align 1 @@ -449,7 +461,7 @@ for.cond.cleanup28: for.body29: %i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29] - %add33 = add i16 undef, %i24.0170 + %add33 = add i16 %val, %i24.0170 %idxprom34 = zext i16 %add33 to i32 %arrayidx35 = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 %idxprom34 store i16 0, ptr %arrayidx35, align 1 diff --git a/llvm/test/Transforms/LoopVectorize/pr32859.ll b/llvm/test/Transforms/LoopVectorize/pr32859.ll index 2d30e0c9ad10f..f65e9cab1700b 100644 --- a/llvm/test/Transforms/LoopVectorize/pr32859.ll +++ b/llvm/test/Transforms/LoopVectorize/pr32859.ll @@ -10,13 +10,13 @@ ; CHECK: %e.0.ph = phi i32 [ 0, %if.end.2.i ], [ 0, %middle.block ] ; Function Attrs: nounwind uwtable -define void @main(i32 %n) #0 { +define void @main(i32 %n, i32 %v) #0 { entry: br label %for.cond1.preheader.i for.cond1.preheader.i: ; preds = %if.end.2.i, %entry %c.06.i = phi i32 [ 0, %entry ], [ %inc5.i, %if.end.2.i ] - %tobool.i = icmp ne i32 undef, 0 + %tobool.i = icmp ne i32 %v, 0 br label %if.end.2.i if.end.2.i: ; preds = %for.cond1.preheader.i diff --git a/llvm/test/Transforms/LoopVectorize/pr36311.ll b/llvm/test/Transforms/LoopVectorize/pr36311.ll index f2dfecc341e6f..bc27e4e9b09cd 100644 --- a/llvm/test/Transforms/LoopVectorize/pr36311.ll +++ b/llvm/test/Transforms/LoopVectorize/pr36311.ll @@ -10,10 +10,7 @@ $test = comdat any -declare i32 @__gxx_personality_v0(...) 
- -; Function Attrs: uwtable -define dso_local void @test(i1 %arg) local_unnamed_addr #0 comdat align 2 personality ptr @__gxx_personality_v0 { +define void @test(ptr %p, i1 %arg) { entry: br label %for.body51 @@ -26,9 +23,9 @@ for.cond80.loopexit: ; preds = %for.body89 for.body89.lr.ph: ; preds = %for.cond80.loopexit, %for.body51 %i79.0179 = phi i32 [ %add90, %for.cond80.loopexit ], [ 0, %for.body51 ] - %next_index.4178 = phi i32 [ %inc94.lcssa, %for.cond80.loopexit ], [ undef, %for.body51 ] + %next_index.4178 = phi i32 [ %inc94.lcssa, %for.cond80.loopexit ], [ 0, %for.body51 ] %add90 = add nuw i32 %i79.0179, 1 - %mul91 = mul i32 %add90, undef + %mul91 = mul i32 %add90, 7 br label %for.body89 for.body89: ; preds = %for.body89, %for.body89.lr.ph @@ -38,10 +35,10 @@ for.body89: ; preds = %for.body89, %for.bo %add93 = add i32 %add92, %mul91 %inc94 = add i32 %next_index.5174, 1 %conv95 = zext i32 %next_index.5174 to i64 - %arrayidx.i160 = getelementptr inbounds i32, ptr undef, i64 %conv95 + %arrayidx.i160 = getelementptr inbounds i32, ptr %p, i64 %conv95 store i32 %add93, ptr %arrayidx.i160, align 4 ;, !tbaa !1 - %cmp87 = icmp ult i32 %add92, undef + %cmp87 = icmp ult i32 %add92, 123 br i1 %cmp87, label %for.body89, label %for.cond80.loopexit nrvo.skipdtor.loopexit: ; preds = %for.cond80.loopexit diff --git a/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll b/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll index 0656cd2b2aa94..0fdc8fd6ad519 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-ptr.ll @@ -15,7 +15,7 @@ define void @PR49215(ptr %p, ptr %q) { ; CHECK-NEXT: [[CMP2:%.*]] = icmp ult ptr [[Q:%.*]], [[G]] ; CHECK-NEXT: [[UMIN]] = select i1 [[CMP2]], ptr [[Q]], ptr [[G]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], undef +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 123 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]] ; CHECK: loopexit: ; CHECK-NEXT: [[UMIN_LCSSA:%.*]] = phi ptr [ [[UMIN]], [[FOR_BODY]] ] @@ -31,7 +31,7 @@ for.body: %cmp2 = icmp ult ptr %q, %g %umin = select i1 %cmp2, ptr %q, ptr %g %iv.next = add nuw nsw i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, undef + %exitcond = icmp eq i64 %iv.next, 123 br i1 %exitcond, label %loopexit, label %for.body loopexit: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll index 13cc1b657d231..f01e562fe40c7 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll @@ -3,7 +3,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -define i8 @PR34687(i1 %c, i32 %x, i32 %n) { +define i8 @PR34687(i1 %c, i32 %x, i32 %n, i32 %divisor) { ; CHECK-LABEL: @PR34687( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N:%.*]], 4 @@ -13,20 +13,30 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT2]], <4 x i1> 
poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[X1:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[VEC_IND]], [[TMP0]] +; CHECK-NEXT: [[PREDPHI1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT3]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[VEC_PHI]], splat (i32 255) -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]], [[BROADCAST_SPLAT4]] ; CHECK-NEXT: [[TMP3:%.*]] = trunc <4 x i32> [[TMP2]] to <4 x i8> ; CHECK-NEXT: [[TMP4]] = zext <4 x i8> [[TMP3]] to <4 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], splat (i32 4) ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP3]]) ; CHECK-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32 +; CHECK-NEXT: [[PREDPHI:%.*]] = extractelement <4 x i32> [[PREDPHI1]], i32 3 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -36,17 +46,19 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n) { ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ] ; CHECK-NEXT: [[R:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[R_NEXT:%.*]], [[IF_END]] ] -; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_END]] +; CHECK-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[IF_END]] ; CHECK: if.then: -; CHECK-NEXT: [[T0:%.*]] = sdiv i32 undef, undef +; CHECK-NEXT: [[T0:%.*]] = sdiv i32 [[I]], [[X]] ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: +; CHECK-NEXT: [[DIV_PHI:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[T0]], [[IF_THEN]] ] ; CHECK-NEXT: [[T1:%.*]] = and i32 [[R]], 255 ; CHECK-NEXT: [[I_NEXT]] = add nsw i32 [[I]], 1 -; CHECK-NEXT: [[R_NEXT]] = add nuw nsw i32 [[T1]], [[X]] +; CHECK-NEXT: [[R_NEXT]] = add nuw nsw i32 [[T1]], [[X1]] ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end: +; CHECK-NEXT: [[DIV_USE:%.*]] = phi i32 [ [[DIV_PHI]], [[IF_END]] ], [ [[PREDPHI]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[T2:%.*]] = phi i32 [ [[R_NEXT]], [[IF_END]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: [[T3:%.*]] = trunc i32 [[T2]] to i8 ; CHECK-NEXT: ret i8 [[T3]] @@ -60,10 +72,11 @@ for.body: br i1 %c, label %if.then, label %if.end if.then: - %t0 = sdiv i32 undef, undef + %t0 = sdiv i32 %i, %divisor br label 
%if.end if.end: + %div_phi = phi i32 [ 0, %for.body ], [ %t0, %if.then ] %t1 = and i32 %r, 255 %i.next = add nsw i32 %i, 1 %r.next = add nuw nsw i32 %t1, %x @@ -71,6 +84,7 @@ if.end: br i1 %cond, label %for.end, label %for.body for.end: + %div_use = phi i32 [ %div_phi, %if.end ] %t2 = phi i32 [ %r.next, %if.end ] %t3 = trunc i32 %t2 to i8 ret i8 %t3 diff --git a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll index c76c2c0ef47a2..ab10d62bc6048 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll @@ -12,12 +12,12 @@ entry: loop: %tmp3 = phi i64 [ 0, %entry ], [ %tmp18, %loop ] - %tmp4 = getelementptr inbounds %struct.foo, ptr %ptr, i64 undef + %tmp4 = getelementptr inbounds %struct.foo, ptr %ptr store i64 0, ptr %tmp4, align 8 %tmp8 = add i64 1, %tmp3 %tmp10 = getelementptr inbounds %struct.foo, ptr %ptr, i64 %tmp8 store i64 1, ptr %tmp10, align 8 - %tmp14 = add i64 undef, %tmp3 + %tmp14 = add i64 3, %tmp3 %tmp16 = getelementptr inbounds %struct.foo, ptr %ptr, i64 %tmp14 store i64 2, ptr %tmp16, align 8 %tmp18 = add nuw nsw i64 %tmp3, 4 diff --git a/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll b/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll index 1fccf546f4a67..d3cd80beaae90 100644 --- a/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll +++ b/llvm/test/Transforms/LoopVectorize/undef-inst-bug.ll @@ -14,7 +14,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK-LABEL: @t( ; CHECK: <4 x i32> -define void @t() { +define void @t(ptr %p) { entry: br label %for.body @@ -22,13 +22,13 @@ for.body: %indvars.iv17 = phi i64 [ %indvars.next, %for.body ], [ 128, %entry ] ; Loop invariant anchored in loop. - %idxprom21 = zext i32 undef to i64 + %idxprom21 = zext i32 0 to i64 - %arrayidx23 = getelementptr inbounds [100 x [100 x i32]], ptr undef, i64 0, i64 %idxprom21, i64 %indvars.iv17 - store i32 undef, ptr %arrayidx23, align 4 + %arrayidx23 = getelementptr inbounds [100 x [100 x i32]], ptr %p, i64 0, i64 %idxprom21, i64 %indvars.iv17 + store i32 poison, ptr %arrayidx23, align 4 %indvars.next= add i64 %indvars.iv17, -1 %0 = trunc i64 %indvars.next to i32 - %cmp15 = icmp ugt i32 %0, undef + %cmp15 = icmp ugt i32 %0, poison br i1 %cmp15, label %for.body, label %loopexit loopexit: diff --git a/llvm/test/Transforms/OpenMP/parallel_deletion.ll b/llvm/test/Transforms/OpenMP/parallel_deletion.ll index 67970c41f765e..0b6c4f32772f5 100644 --- a/llvm/test/Transforms/OpenMP/parallel_deletion.ll +++ b/llvm/test/Transforms/OpenMP/parallel_deletion.ll @@ -385,7 +385,7 @@ define internal void @.omp_outlined..4(ptr noalias %.global_tid., ptr noalias %. 
; CHECK-LABEL: define {{[^@]+}}@.omp_outlined..4 ; CHECK-SAME: (ptr noalias nofree noundef nonnull readonly align 4 captures(none) dereferenceable(4) [[DOTGLOBAL_TID_:%.*]], ptr noalias nofree readnone captures(none) [[DOTBOUND_TID_:%.*]], ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1:![0-9]+]] ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_master(ptr noundef nonnull @[[GLOB0]], i32 [[TMP]]) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_END:%.*]], label [[OMP_IF_THEN:%.*]] @@ -458,7 +458,7 @@ define internal void @.omp_outlined..5(ptr noalias %.global_tid., ptr noalias %. ; CHECK-SAME: (ptr noalias nofree readonly captures(none) [[DOTGLOBAL_TID_:%.*]], ptr noalias nofree readnone captures(none) [[DOTBOUND_TID_:%.*]], ptr nofree noundef nonnull align 4 captures(none) dereferenceable(4) [[A:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[OMP_GLOBAL_THREAD_NUM:%.*]] = call i32 @__kmpc_global_thread_num(ptr noundef nonnull @[[GLOB0]]) #[[ATTR19]] -; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1]] ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_single(ptr noundef nonnull @[[GLOB0]], i32 [[TMP]]) ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[OMP_IF_END:%.*]], label [[OMP_IF_THEN:%.*]] @@ -534,7 +534,7 @@ define internal void @.omp_outlined..6(ptr noalias %.global_tid., ptr noalias %. ; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr noundef nonnull align 4 [[A1]]) #[[ATTR20:[0-9]+]] ; CHECK-NEXT: store i32 1, ptr [[A1]], align 4 ; CHECK-NEXT: store ptr [[A1]], ptr [[DOTOMP_REDUCTION_RED_LIST]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTGLOBAL_TID_]], align 4, !invariant.load [[META1]] ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_reduce_nowait(ptr noundef nonnull @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 noundef 1, i64 noundef 8, ptr noundef nonnull align 8 [[DOTOMP_REDUCTION_RED_LIST]], ptr noundef nonnull @.omp.reduction.reduction_func, ptr noundef nonnull @.gomp_critical_user_.reduction.var) ; CHECK-NEXT: switch i32 [[TMP4]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ ; CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] @@ -646,10 +646,10 @@ define internal void @.omp.reduction.reduction_func(ptr %arg, ptr %arg1) { ; CHECK-LABEL: define {{[^@]+}}@.omp.reduction.reduction_func ; CHECK-SAME: (ptr nofree noundef nonnull readonly align 8 captures(none) dereferenceable(8) [[ARG:%.*]], ptr nofree noundef nonnull readonly align 8 captures(none) dereferenceable(8) [[ARG1:%.*]]) #[[ATTR10:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARG1]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARG1]], align 8, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[ARG]], align 8, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !invariant.load [[META1]] +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP2]], align 4, 
!invariant.load [[META1]] ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], [[TMP6]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[TMP4]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll index 55adda7d5b0f3..08191c636bd3f 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/predicated-reduction.ll @@ -18,45 +18,45 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483640 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT19]], <4 x double> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT14]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI15:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX1]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP23]], align 4 +; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI18:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 ; CHECK-NEXT: 
[[TMP2:%.*]] = fpext <4 x float> [[WIDE_LOAD]] to <4 x double> -; CHECK-NEXT: [[TMP3:%.*]] = fpext <4 x float> [[WIDE_LOAD18]] to <4 x double> +; CHECK-NEXT: [[TMP3:%.*]] = fpext <4 x float> [[WIDE_LOAD19]] to <4 x double> ; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <4 x double> [[BROADCAST_SPLAT]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <4 x double> [[BROADCAST_SPLAT]], [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <4 x double> [[TMP4]], [[BROADCAST_SPLAT20]] -; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <4 x double> [[TMP5]], [[BROADCAST_SPLAT20]] +; CHECK-NEXT: [[TMP6:%.*]] = fsub fast <4 x double> [[TMP4]], [[BROADCAST_SPLAT15]] +; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <4 x double> [[TMP5]], [[BROADCAST_SPLAT15]] ; CHECK-NEXT: [[TMP8:%.*]] = fcmp fast ogt <4 x double> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = fcmp fast ogt <4 x double> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[TMP6]], [[TMP6]] ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <4 x double> [[TMP7]], [[TMP7]] ; CHECK-NEXT: [[TMP12:%.*]] = select ninf <4 x i1> [[TMP8]], <4 x double> [[TMP6]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP13:%.*]] = select ninf <4 x i1> [[TMP9]], <4 x double> [[TMP7]], <4 x double> splat (double -0.000000e+00) -; CHECK-NEXT: [[TMP14]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI16]], [[TMP12]] -; CHECK-NEXT: [[TMP15]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI17]], [[TMP13]] +; CHECK-NEXT: [[TMP14]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI17]], [[TMP12]] +; CHECK-NEXT: [[TMP15]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI18]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = select ninf <4 x i1> [[TMP8]], <4 x double> [[TMP10]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP17:%.*]] = select ninf <4 x i1> [[TMP9]], <4 x double> [[TMP11]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP18]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI]], [[TMP16]] -; CHECK-NEXT: [[TMP19]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI15]], [[TMP17]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV1]], 8 +; CHECK-NEXT: [[TMP19]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI16]], [[TMP17]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP20]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] ; CHECK-NEXT: [[TMP21:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX]]) -; CHECK-NEXT: [[BIN_RDX21:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP15]], [[TMP14]] -; CHECK-NEXT: [[TMP22:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX21]]) +; CHECK-NEXT: [[BIN_RDX20:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP22:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX20]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[FOR_BODY_PREHEADER22]] ; CHECK: [[FOR_BODY_PREHEADER22]]: @@ -65,11 +65,11 @@ define nofpclass(nan inf) 
double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[V0_010_PH:%.*]] = phi double [ 0.000000e+00, %[[FOR_BODY_PREHEADER]] ], [ [[TMP22]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER22]] ] +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY_PREHEADER22]] ] ; CHECK-NEXT: [[V1_012:%.*]] = phi double [ [[V1_2:%.*]], %[[FOR_BODY]] ], [ [[V1_011_PH]], %[[FOR_BODY_PREHEADER22]] ] ; CHECK-NEXT: [[V0_011:%.*]] = phi double [ [[V0_2:%.*]], %[[FOR_BODY]] ], [ [[V0_010_PH]], %[[FOR_BODY_PREHEADER22]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[CONV:%.*]] = fpext float [[TMP0]] to double ; CHECK-NEXT: [[MUL:%.*]] = fmul fast double [[Y]], [[CONV]] ; CHECK-NEXT: [[SUB:%.*]] = fsub fast double [[MUL]], [[Z]] @@ -79,16 +79,16 @@ define nofpclass(nan inf) double @monte_simple(i32 noundef %nblocks, i32 noundef ; CHECK-NEXT: [[V0_2]] = fadd reassoc arcp contract afn double [[V0_011]], [[ADD8]] ; CHECK-NEXT: [[ADD4:%.*]] = select ninf i1 [[CMP1]], double [[MUL3]], double -0.000000e+00 ; CHECK-NEXT: [[V1_2]] = fadd reassoc arcp contract afn double [[V1_012]], [[ADD4]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: [[FOR_END_LOOPEXIT]]: -; CHECK-NEXT: [[V0_1:%.*]] = phi double [ [[TMP22]], %[[MIDDLE_BLOCK]] ], [ [[V0_2]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[V1_1:%.*]] = phi double [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[V1_2]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = fadd fast double [[V1_1]], [[V0_1]] +; CHECK-NEXT: [[V0_1_LCSSA:%.*]] = phi double [ [[TMP22]], %[[MIDDLE_BLOCK]] ], [ [[V0_2]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[V1_1_LCSSA:%.*]] = phi double [ [[TMP21]], %[[MIDDLE_BLOCK]] ], [ [[V1_2]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[TMP24:%.*]] = fadd fast double [[V1_1_LCSSA]], [[V0_1_LCSSA]] ; CHECK-NEXT: br label %[[FOR_END]] ; CHECK: [[FOR_END]]: -; CHECK-NEXT: [[ADD5:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP1]], %[[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[ADD5:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP24]], %[[FOR_END_LOOPEXIT]] ] ; CHECK-NEXT: ret double [[ADD5]] ; entry: @@ -193,29 +193,29 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 2147483640 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[Y]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT35]], <4 x double> poison, <4 x 
i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <4 x double> poison, double [[Z]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT29]], <4 x double> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br label %[[FOR_BODY_US:.*]] ; CHECK: [[FOR_BODY_US]]: -; CHECK-NEXT: [[V1_021_US:%.*]] = phi double [ [[V1_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US:.*]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] -; CHECK-NEXT: [[V0_020_US:%.*]] = phi double [ [[V0_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: [[V1_019_US:%.*]] = phi double [ [[V1_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US:.*]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] +; CHECK-NEXT: [[V0_018_US:%.*]] = phi double [ [[V0_2_US_LCSSA:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY_US_PREHEADER]] ] ; CHECK-NEXT: [[BLOCK_017_US:%.*]] = phi i32 [ [[INC9_US:%.*]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0, %[[FOR_BODY_US_PREHEADER]] ] ; CHECK-NEXT: tail call void @resample(i32 noundef [[RAND_BLOCK_LENGTH]], ptr noundef [[SAMPLES]]) ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[FOR_BODY3_US_PREHEADER:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V1_021_US]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V0_020_US]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V1_019_US]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> <double poison, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, double [[V0_018_US]], i64 0 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ [[TMP2]], %[[VECTOR_PH]] ], [ [[TMP22:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI31:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP23:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI32:%.*]] = phi <4 x double> [ [[TMP27]], %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI33:%.*]] = phi <4 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP19:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX_US1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US1]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX_US1]], align 4 +; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX_US]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[ARRAYIDX_US]], align 4 ; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <4 x float>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x 
float> [[WIDE_LOAD]] to <4 x double> ; CHECK-NEXT: [[TMP5:%.*]] = fpext <4 x float> [[WIDE_LOAD34]] to <4 x double> @@ -223,8 +223,8 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[TMP7:%.*]] = tail call fast <4 x double> @llvm.exp2.v4f64(<4 x double> [[TMP5]]) ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x double> [[TMP6]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <4 x double> [[TMP7]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <4 x double> [[TMP8]], [[BROADCAST_SPLAT36]] -; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <4 x double> [[TMP9]], [[BROADCAST_SPLAT36]] +; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <4 x double> [[TMP8]], [[BROADCAST_SPLAT30]] +; CHECK-NEXT: [[TMP11:%.*]] = fsub fast <4 x double> [[TMP9]], [[BROADCAST_SPLAT30]] ; CHECK-NEXT: [[TMP12:%.*]] = fcmp fast ogt <4 x double> [[TMP10]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fcmp fast ogt <4 x double> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x double> [[TMP10]], [[TMP10]] @@ -237,26 +237,26 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[TMP21:%.*]] = select ninf <4 x i1> [[TMP13]], <4 x double> [[TMP15]], <4 x double> splat (double -0.000000e+00) ; CHECK-NEXT: [[TMP22]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI]], [[TMP20]] ; CHECK-NEXT: [[TMP23]] = fadd reassoc arcp contract afn <4 x double> [[VEC_PHI31]], [[TMP21]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV1]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP24]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP23]], [[TMP22]] ; CHECK-NEXT: [[TMP25:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX]]) -; CHECK-NEXT: [[BIN_RDX37:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] -; CHECK-NEXT: [[TMP26:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX37]]) +; CHECK-NEXT: [[BIN_RDX35:%.*]] = fadd reassoc arcp contract afn <4 x double> [[TMP19]], [[TMP18]] +; CHECK-NEXT: [[TMP26:%.*]] = tail call reassoc arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[BIN_RDX35]]) ; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]], label %[[FOR_BODY3_US_PREHEADER]] ; CHECK: [[FOR_BODY3_US_PREHEADER]]: ; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, %[[FOR_BODY_US]] ], [ [[N_VEC]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[V1_114_US_PH:%.*]] = phi double [ [[V1_021_US]], %[[FOR_BODY_US]] ], [ [[TMP25]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[V0_113_US_PH:%.*]] = phi double [ [[V0_020_US]], %[[FOR_BODY_US]] ], [ [[TMP26]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[V1_114_US_PH:%.*]] = phi double [ [[V1_019_US]], %[[FOR_BODY_US]] ], [ [[TMP25]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[V0_113_US_PH:%.*]] = phi double [ [[V0_018_US]], %[[FOR_BODY_US]] ], [ [[TMP26]], %[[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label %[[FOR_BODY3_US:.*]] ; CHECK: [[FOR_BODY3_US]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3_US]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY3_US_PREHEADER]] ] +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi 
i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY3_US]] ], [ [[INDVARS_IV_PH]], %[[FOR_BODY3_US_PREHEADER]] ] ; CHECK-NEXT: [[V1_116_US:%.*]] = phi double [ [[V1_2_US:%.*]], %[[FOR_BODY3_US]] ], [ [[V1_114_US_PH]], %[[FOR_BODY3_US_PREHEADER]] ] ; CHECK-NEXT: [[V0_115_US:%.*]] = phi double [ [[V0_2_US:%.*]], %[[FOR_BODY3_US]] ], [ [[V0_113_US_PH]], %[[FOR_BODY3_US_PREHEADER]] ] -; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX_US]], align 4 +; CHECK-NEXT: [[ARRAYIDX_US1:%.*]] = getelementptr inbounds nuw float, ptr [[SAMPLES]], i64 [[INDVARS_IV1]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[ARRAYIDX_US1]], align 4 ; CHECK-NEXT: [[CONV_US:%.*]] = fpext float [[TMP0]] to double ; CHECK-NEXT: [[TMP1:%.*]] = tail call fast double @llvm.exp2.f64(double [[CONV_US]]) ; CHECK-NEXT: [[MUL_US:%.*]] = fmul fast double [[TMP1]], [[Y]] @@ -267,7 +267,7 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[V0_2_US]] = fadd reassoc arcp contract afn double [[V0_115_US]], [[ADD12_US]] ; CHECK-NEXT: [[ADD7_US1:%.*]] = select ninf i1 [[CMP4_US]], double [[ADD7_US]], double -0.000000e+00 ; CHECK-NEXT: [[V1_2_US]] = fadd reassoc arcp contract afn double [[V1_116_US]], [[ADD7_US1]] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[EXITCOND25_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] ; CHECK-NEXT: br i1 [[EXITCOND25_NOT]], label %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]], label %[[FOR_BODY3_US]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[FOR_COND1_FOR_INC8_CRIT_EDGE_US]]: @@ -275,17 +275,18 @@ define nofpclass(nan inf) double @monte_exp(i32 noundef %nblocks, i32 noundef %R ; CHECK-NEXT: [[V1_2_US_LCSSA]] = phi double [ [[TMP25]], %[[MIDDLE_BLOCK]] ], [ [[V1_2_US]], %[[FOR_BODY3_US]] ] ; CHECK-NEXT: [[INC9_US]] = add nuw nsw i32 [[BLOCK_017_US]], 1 ; CHECK-NEXT: [[EXITCOND26_NOT:%.*]] = icmp eq i32 [[INC9_US]], [[NBLOCKS]] -; CHECK-NEXT: br i1 [[EXITCOND26_NOT]], label %[[FOR_END10]], label %[[FOR_BODY_US]] +; CHECK-NEXT: br i1 [[EXITCOND26_NOT]], label %[[FOR_END10_LOOPEXIT:.*]], label %[[FOR_BODY_US]] ; CHECK: [[FOR_BODY]]: ; CHECK-NEXT: [[BLOCK_017:%.*]] = phi i32 [ [[INC9:%.*]], %[[FOR_BODY]] ], [ 0, %[[FOR_BODY_LR_PH]] ] ; CHECK-NEXT: tail call void @resample(i32 noundef [[RAND_BLOCK_LENGTH]], ptr noundef [[SAMPLES]]) ; CHECK-NEXT: [[INC9]] = add nuw nsw i32 [[BLOCK_017]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC9]], [[NBLOCKS]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END10]], label %[[FOR_BODY]] +; CHECK: [[FOR_END10_LOOPEXIT]]: +; CHECK-NEXT: [[TMP29:%.*]] = fadd fast double [[V1_2_US_LCSSA]], [[V0_2_US_LCSSA]] +; CHECK-NEXT: br label %[[FOR_END10]] ; CHECK: [[FOR_END10]]: -; CHECK-NEXT: [[V0_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V0_2_US_LCSSA]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ] -; CHECK-NEXT: [[V1_0_LCSSA:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[V1_2_US_LCSSA]], %[[FOR_COND1_FOR_INC8_CRIT_EDGE_US]] ], [ 0.000000e+00, %[[FOR_BODY]] ] -; CHECK-NEXT: [[ADD11:%.*]] = fadd fast double [[V1_0_LCSSA]], [[V0_0_LCSSA]] +; CHECK-NEXT: [[ADD11:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP29]], %[[FOR_END10_LOOPEXIT]] ], [ 0.000000e+00, %[[FOR_BODY]] ] ; CHECK-NEXT: ret double [[ADD11]] ; entry: diff --git 
a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll index 09f583f9242d5..3416584729317 100644 --- a/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll +++ b/llvm/test/Transforms/PreISelIntrinsicLowering/AArch64/expand-exp.ll @@ -38,5 +38,5 @@ define <4 x float> @fixed_vec_exp(<4 x float> %input) { declare <4 x float> @llvm.exp.v4f32(<4 x float>) #0 declare <vscale x 4 x float> @llvm.exp.nxv4f32(<vscale x 4 x float>) #0 -; CHECK: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll index c1a87f0c5f907..577efcbbac012 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vecreduceadd.ll @@ -930,7 +930,7 @@ entry: ; COST-LABEL: Function: mla_v8i8_i32 -; COST: Cost: '-18' +; COST: Cost: '-24' define i32 @mla_v8i8_i32(ptr %x, ptr %y) "target-features"="+dotprod" { ; CHECK-LABEL: @mla_v8i8_i32( ; CHECK-NEXT: entry: @@ -1009,7 +1009,7 @@ entry: ; COST-LABEL: Function: mla_v16i8_i32 -; COST: Cost: '-40' +; COST: Cost: '-52' define i32 @mla_v16i8_i32(ptr %x, ptr %y) "target-features"="+dotprod" { ; CHECK-LABEL: @mla_v16i8_i32( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll b/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll new file mode 100644 index 0000000000000..590b0be973002 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/parent-non-schedule-multi-use-in-binop.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S --mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +@a = common global [100 x i64] zeroinitializer, align 64 + +define void @test() { +; CHECK-LABEL: define void @test() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i64> [[TMP0]], splat (i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> [[TMP1]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], splat (i64 1) +; CHECK-NEXT: br i1 false, label %[[LOP_RHSCNT_I_PEEL:.*]], label %[[LAND_END_I_PEEL:.*]] +; CHECK: [[LOP_RHSCNT_I_PEEL]]: +; CHECK-NEXT: [[TMP4:%.*]] = or <2 x i64> [[TMP1]], <i64 1, i64 0> +; CHECK-NEXT: br label %[[LAND_END_I_PEEL]] +; CHECK: [[LAND_END_I_PEEL]]: +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i64> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP4]], %[[LOP_RHSCNT_I_PEEL]] ] +; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 +; CHECK-NEXT: ret void +; +entry: + %.promoted104.i = load i64, ptr getelementptr inbounds nuw (i8, ptr @a, i64 56), align 8 + %.promoted103.i = load i64, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 + %0 = add i64 %.promoted104.i, 1 + %1 = add i64 %.promoted103.i, 1 + %2 = add i64 %0, 1 + br i1 false, label %lop.rhscnt.i.peel, label %land.end.i.peel + +lop.rhscnt.i.peel: + %3 = or i64 %1, 1 + br 
label %land.end.i.peel + +land.end.i.peel: + %4 = phi i64 [ %2, %entry ], [ %0, %lop.rhscnt.i.peel ] + %5 = phi i64 [ %1, %entry ], [ %3, %lop.rhscnt.i.peel ] + store i64 %5, ptr getelementptr inbounds nuw (i8, ptr @a, i64 48), align 8 + store i64 %4, ptr getelementptr inbounds nuw (i8, ptr @a, i64 56), align 8 + ret void +} diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll index 533b1f691f5ad..42b32e769d8d7 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll @@ -1,26 +1,35 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --version 5 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt -passes='loop(simple-loop-unswitch<nontrivial>),verify<loops>' -simple-loop-unswitch-guards -S < %s | FileCheck %s ; RUN: opt -passes='simple-loop-unswitch<nontrivial>' -simple-loop-unswitch-guards -S < %s | FileCheck %s ; RUN: opt -passes='loop-mssa(simple-loop-unswitch<nontrivial>),verify<loops>' -simple-loop-unswitch-guards -verify-memoryssa -verify-loop-info -S < %s | FileCheck %s declare void @llvm.experimental.guard(i1, ...) -define void @test_simple_case(i1 %cond, i32 %N) { -; CHECK-LABEL: @test_simple_case( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: entry.split.us: -; CHECK-NEXT: br label [[LOOP_US:%.*]] -; CHECK: loop.us: -; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US]] -; CHECK: guarded.us: +define void @test_simple_case(i1 %cond, i32 %N) !prof !0 { +; CHECK-LABEL: define void @test_simple_case( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) !prof [[PROF0:![0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1:![0-9]+]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[GUARDED_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US]] +; CHECK: [[GUARDED_US]]: ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]] -; CHECK: deopt: +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -38,29 +47,44 @@ exit: } define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) { -; CHECK-LABEL: @test_two_guards( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: entry.split.us: -; CHECK-NEXT: br i1 [[COND2:%.*]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]] -; CHECK: entry.split.us.split.us: -; CHECK-NEXT: br label [[LOOP_US_US:%.*]] -; CHECK: loop.us.us: -; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[IV_NEXT_US_US:%.*]], [[GUARDED_US2:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US_US:%.*]] -; CHECK: guarded.us.us: -; CHECK-NEXT: br label [[GUARDED_US2]] -; CHECK: guarded.us2: +; CHECK-LABEL: define void @test_two_guards( +; CHECK-SAME: i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br i1 [[COND2]], label %[[ENTRY_SPLIT_US_SPLIT_US:.*]], label %[[ENTRY_SPLIT_US_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT_US_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US_US:.*]] +; CHECK: [[LOOP_US_US]]: +; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[IV_NEXT_US_US:%.*]], %[[GUARDED_US2:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US_US:.*]] +; CHECK: [[GUARDED_US_US]]: +; CHECK-NEXT: br label %[[GUARDED_US2]] +; CHECK: [[GUARDED_US2]]: ; CHECK-NEXT: [[IV_NEXT_US_US]] = add i32 [[IV_US_US]], 1 -; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label [[LOOP_US_US]], label [[EXIT_SPLIT_US_SPLIT_US:%.*]] -; CHECK: deopt1: +; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label %[[LOOP_US_US]], label %[[EXIT_SPLIT_US_SPLIT_US:.*]] +; CHECK: [[EXIT_SPLIT_US_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[ENTRY_SPLIT_US_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: br label %[[GUARDED_US:.*]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: br label %[[DEOPT1:.*]] +; CHECK: [[DEOPT1]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: deopt: +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: exit: +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -80,35 +104,46 @@ exit: } define void @test_conditional_guards(i1 %cond, i32 %N) { -; CHECK-LABEL: @test_conditional_guards( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[FROZEN:%.+]] = freeze i1 [[COND:%.*]] -; CHECK-NEXT: br i1 [[FROZEN]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: entry.split.us: -; CHECK-NEXT: br label [[LOOP_US:%.*]] -; CHECK: loop.us: -; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], [[BACKEDGE_US:%.*]] ] +; CHECK-LABEL: define void @test_conditional_guards( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[COND]] +; CHECK-NEXT: br i1 [[COND_FR]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[BACKEDGE_US:.*]] ] ; CHECK-NEXT: [[CONDITION_US:%.*]] = icmp eq i32 [[IV_US]], 123 -; CHECK-NEXT: br i1 [[CONDITION_US]], label [[GUARD_US:%.*]], label [[BACKEDGE_US]] -; CHECK: guard.us: -; CHECK-NEXT: br label [[GUARDED_US:%.*]] -; CHECK: backedge.us: +; CHECK-NEXT: br i1 [[CONDITION_US]], label %[[GUARD_US:.*]], label %[[BACKEDGE_US]] +; CHECK: [[GUARD_US]]: +; CHECK-NEXT: br label %[[GUARDED_US:.*]] +; CHECK: [[BACKEDGE_US]]: ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: br label %[[BACKEDGE_US]] +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], %[[BACKEDGE:.*]] ] ; CHECK-NEXT: [[CONDITION:%.*]] = icmp eq i32 [[IV]], 123 -; CHECK-NEXT: br i1 [[CONDITION]], label [[GUARD:%.*]], label [[BACKEDGE]] -; CHECK: guard: -; CHECK-NEXT: br label [[DEOPT:%.*]] -; CHECK: deopt: +; CHECK-NEXT: br i1 [[CONDITION]], label %[[GUARD:.*]], label %[[BACKEDGE]] +; CHECK: [[GUARD]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: backedge: +; CHECK: [[BACKEDGE]]: ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label %loop, label [[EXIT_SPLIT:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label %[[LOOP]], label %[[EXIT_SPLIT:.*]] +; CHECK: [[EXIT_SPLIT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -133,53 +168,54 @@ exit: } define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) { -; CHECK-LABEL: define void @test_nested_loop(i1 %cond, i32 %N, i1 %arg) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 %cond, label %entry.split, label %outer_loop.split -; CHECK: entry.split: -; CHECK-NEXT: br i1 %arg, label %entry.split.split.us, label %entry.split.split -; CHECK: entry.split.split.us: -; CHECK-NEXT: br label %outer_loop.us -; CHECK: outer_loop.us: -; CHECK-NEXT: br label %outer_loop.split.us.us -; CHECK: outer_backedge.us: -; CHECK-NEXT: br label %outer_loop.us -; CHECK: outer_loop.split.us.us: -; CHECK-NEXT: br label %loop.us.us -; CHECK: loop.us.us: -; CHECK-NEXT: %iv.us.us = phi i32 [ 0, %outer_loop.split.us.us ], [ %iv.next.us.us, %guarded.us.us ] -; CHECK-NEXT: br label %guarded.us.us -; CHECK: guarded.us.us: -; CHECK-NEXT: %iv.next.us.us = add i32 %iv.us.us, 1 -; CHECK-NEXT: %loop.cond.us.us = icmp slt i32 %iv.next.us.us, %N -; CHECK-NEXT: br i1 %loop.cond.us.us, label %loop.us.us, label %outer_backedge.split.us.us -; CHECK: outer_backedge.split.us.us: -; CHECK-NEXT: br label %outer_backedge.us -; CHECK: entry.split.split: -; CHECK-NEXT: br label %outer_loop -; CHECK: outer_loop: -; CHECK-NEXT: br label %outer_loop.split.us -; CHECK: outer_loop.split.us: -; CHECK-NEXT: br label %loop.us -; CHECK: loop.us: -; CHECK-NEXT: %iv.us = phi i32 [ 0, %outer_loop.split.us ], [ %iv.next.us, %guarded.us ] -; CHECK-NEXT: br label %guarded.us -; CHECK: guarded.us: -; CHECK-NEXT: %iv.next.us = add i32 %iv.us, 1 -; CHECK-NEXT: %loop.cond.us = icmp slt i32 %iv.next.us, %N -; CHECK-NEXT: br i1 %loop.cond.us, label %loop.us, label %outer_backedge.split.us -; CHECK: outer_backedge.split.us: -; CHECK-NEXT: br label %outer_backedge -; CHECK: outer_loop.split: -; CHECK-NEXT: br label %loop -; CHECK: loop: -; CHECK-NEXT: br label %deopt -; CHECK: deopt: +; CHECK-LABEL: define void @test_nested_loop( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]], i1 [[ARG:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT:.*]], label %[[OUTER_LOOP_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br i1 [[ARG]], label %[[ENTRY_SPLIT_SPLIT_US:.*]], label %[[ENTRY_SPLIT_SPLIT:.*]] +; CHECK: [[ENTRY_SPLIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_US:.*]] +; CHECK: [[OUTER_LOOP_US]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_SPLIT_US_US:.*]] +; CHECK: [[OUTER_BACKEDGE_US:.*]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_US]] +; CHECK: [[OUTER_LOOP_SPLIT_US_US]]: +; CHECK-NEXT: br label %[[LOOP_US_US:.*]] +; CHECK: [[LOOP_US_US]]: +; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, %[[OUTER_LOOP_SPLIT_US_US]] ], [ [[IV_NEXT_US_US:%.*]], %[[GUARDED_US_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US_US]] +; CHECK: [[GUARDED_US_US]]: +; CHECK-NEXT: [[IV_NEXT_US_US]] = add i32 [[IV_US_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label %[[LOOP_US_US]], label %[[OUTER_BACKEDGE_SPLIT_US_US:.*]] +; CHECK: 
[[OUTER_BACKEDGE_SPLIT_US_US]]: +; CHECK-NEXT: br label %[[OUTER_BACKEDGE_US]] +; CHECK: [[ENTRY_SPLIT_SPLIT]]: +; CHECK-NEXT: br label %[[OUTER_LOOP:.*]] +; CHECK: [[OUTER_LOOP]]: +; CHECK-NEXT: br label %[[OUTER_LOOP_SPLIT_US:.*]] +; CHECK: [[OUTER_LOOP_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP_US:.*]] +; CHECK: [[LOOP_US]]: +; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ 0, %[[OUTER_LOOP_SPLIT_US]] ], [ [[IV_NEXT_US:%.*]], %[[GUARDED_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label %[[LOOP_US]], label %[[OUTER_BACKEDGE_SPLIT_US:.*]] +; CHECK: [[OUTER_BACKEDGE_SPLIT_US]]: +; CHECK-NEXT: br label %[[OUTER_BACKEDGE:.*]] +; CHECK: [[OUTER_LOOP_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: ; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: outer_backedge: -; CHECK-NEXT: br label %exit -; CHECK: exit: +; CHECK: [[OUTER_BACKEDGE]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT]]: ; CHECK-NEXT: ret void ; @@ -204,17 +240,50 @@ exit: } define void @test_sibling_loops(i1 %cond1, i1 %cond2, i32 %N) { -; CHECK-LABEL: @test_sibling_loops( -; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] -; CHECK: [[IV1_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[IV1_NEXT_US:%.*]], [[GUARDED_US:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US]] -; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK-LABEL: define void @test_sibling_loops( +; CHECK-SAME: i1 [[COND1:%.*]], i1 [[COND2:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[COND1]], label %[[ENTRY_SPLIT_US:.*]], label %[[ENTRY_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[ENTRY_SPLIT_US]]: +; CHECK-NEXT: br label %[[LOOP1_US:.*]] +; CHECK: [[LOOP1_US]]: +; CHECK-NEXT: [[IV1_US:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT_US]] ], [ [[IV1_NEXT_US:%.*]], %[[GUARDED_US:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US]] +; CHECK: [[GUARDED_US]]: +; CHECK-NEXT: [[IV1_NEXT_US]] = add i32 [[IV1_US]], 1 +; CHECK-NEXT: [[LOOP1_COND_US:%.*]] = icmp slt i32 [[IV1_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP1_COND_US]], label %[[LOOP1_US]], label %[[BETWEEN_SPLIT_US:.*]] +; CHECK: [[BETWEEN_SPLIT_US]]: +; CHECK-NEXT: br label %[[BETWEEN:.*]] +; CHECK: [[ENTRY_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP1:.*]] +; CHECK: [[LOOP1]]: +; CHECK-NEXT: br label %[[DEOPT:.*]] +; CHECK: [[DEOPT]]: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable -; CHECK: [[IV2_US:%.*]] = phi i32 [ 0, [[BETWEEN:%.*]] ], [ [[IV1_NEXT_US2:%.*]], [[GUARDED_US2:%.*]] ] -; CHECK-NEXT: br label [[GUARDED_US2]] -; CHECK: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK: [[BETWEEN]]: +; CHECK-NEXT: br i1 [[COND2]], label %[[BETWEEN_SPLIT_US2:.*]], label %[[BETWEEN_SPLIT:.*]], !prof [[PROF1]] +; CHECK: [[BETWEEN_SPLIT_US2]]: +; CHECK-NEXT: br label %[[LOOP2_US:.*]] +; CHECK: [[LOOP2_US]]: +; CHECK-NEXT: [[IV2_US:%.*]] = phi i32 [ 0, %[[BETWEEN_SPLIT_US2]] ], [ [[IV2_NEXT_US:%.*]], %[[GUARDED_US3:.*]] ] +; CHECK-NEXT: br label %[[GUARDED_US3]] +; CHECK: [[GUARDED_US3]]: +; CHECK-NEXT: [[IV2_NEXT_US]] = add i32 [[IV2_US]], 1 +; CHECK-NEXT: [[LOOP2_COND_US:%.*]] = icmp slt i32 [[IV2_NEXT_US]], [[N]] +; CHECK-NEXT: br i1 [[LOOP2_COND_US]], label %[[LOOP2_US]], label %[[EXIT_SPLIT_US:.*]] +; CHECK: [[EXIT_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[BETWEEN_SPLIT]]: +; CHECK-NEXT: br label %[[LOOP2:.*]] +; CHECK: [[LOOP2]]: +; CHECK-NEXT: br label %[[DEOPT1:.*]] +; CHECK: [[DEOPT1]]: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: @@ -242,11 +311,21 @@ exit: } ; Check that we don't do anything because of cleanuppad. -; CHECK-LABEL: @test_cleanuppad( -; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ] -; CHECK-NOT: call void (i1, ...) @llvm.experimental.guard( define void @test_cleanuppad(i1 %cond, i32 %N) personality ptr @__CxxFrameHandler3 { - +; CHECK-LABEL: define void @test_cleanuppad( +; CHECK-SAME: i1 [[COND:%.*]], i32 [[N:%.*]]) personality ptr @__CxxFrameHandler3 { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[COND]]) [ "deopt"() ] +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: invoke void @may_throw(i32 [[IV]]) +; CHECK-NEXT: to label %[[LOOP]] unwind label %[[EXIT:.*]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[CP:%.*]] = cleanuppad within none [] +; CHECK-NEXT: cleanupret from [[CP]] unwind to caller +; entry: br label %loop @@ -264,3 +343,9 @@ exit: declare void @may_throw(i32 %i) declare i32 @__CxxFrameHandler3(...) + +!0 = !{!"function_entry_count", i32 10} +;. +; CHECK: [[PROF0]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1048575, i32 1} +;. 
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll index 536e0c6a0e74a..3c84dea2a0672 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll @@ -2,40 +2,40 @@ ; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true -passes="loop(simple-loop-unswitch<nontrivial>),simplifycfg" | FileCheck %s ; RUN: opt < %s -S -simple-loop-unswitch-inject-invariant-conditions=true -passes="loop-mssa(simple-loop-unswitch<nontrivial>),simplifycfg" -verify-memoryssa | FileCheck %s -define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { +define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) !prof !{!"function_entry_count", i32 10} { ; CHECK-LABEL: @test_01( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1:![0-9]+]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[X]] -; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] +; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: loop.us: -; CHECK-NEXT: [[IV_US:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] -; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 -; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[LIMIT]] -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT_US:%.*]], [[GUARDED_US:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] +; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 +; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3:![0-9]+]] ; CHECK: guarded.us: -; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] -; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] -; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4 -; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 +; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL]], [[X]] +; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] +; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR_US]], align 4 +; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] ; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] -; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 -; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[EL_PTR1:%.*]] = 
getelementptr i32, ptr [[P]], i32 [[IV1]] +; CHECK-NEXT: [[EL1:%.*]] = load i32, ptr [[EL_PTR1]], align 4 +; CHECK-NEXT: [[BOUND_CHECK1:%.*]] = icmp ult i32 [[EL1]], [[LIMIT]] +; CHECK-NEXT: br i1 [[BOUND_CHECK1]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: -; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL1]], [[X]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: -; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] -; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL1]] +; CHECK-NEXT: store i32 [[IV1]], ptr [[ARR_PTR]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV1]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -76,7 +76,7 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_void_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_void_profile( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] @@ -133,7 +133,7 @@ range_check_failed: ; preds = %guarded define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_constants( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 200, 300 ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -141,7 +141,7 @@ define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], 200 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], 300 ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -154,13 +154,13 @@ define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], 200 -; CHECK-NEXT: br i1 
[[BOUND_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -200,17 +200,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_degenerate_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_degenerate_profile( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF5:![0-9]+]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF7:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -257,17 +257,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_cold(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_cold( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF6:![0-9]+]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF8:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = 
getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -314,17 +314,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_overflowing_metadata(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_overflowing_metadata( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF7:![0-9]+]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF9:![0-9]+]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF7]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF9]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -371,7 +371,7 @@ range_check_failed: ; preds = %guarded define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_02( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -379,7 +379,7 @@ define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp sge i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -392,16 +392,16 @@ define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp sge i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: 
[[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -441,7 +441,7 @@ range_check_failed: ; preds = %guarded define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_02_inverse( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -449,7 +449,7 @@ define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp sge i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[GUARDED_US]], label [[COMMON_RET:%.*]], !prof [[PROF3]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp uge i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -462,16 +462,16 @@ define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp sge i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp uge i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[COMMON_RET]], label [[BACKEDGE]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[COMMON_RET]], label [[BACKEDGE]], !prof [[PROF11:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -511,7 +511,7 @@ range_check_failed: ; preds = %guarded define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_03( ; CHECK-NEXT: entry: 
-; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -519,7 +519,7 @@ define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp slt i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF10:![0-9]+]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF11]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] @@ -532,16 +532,16 @@ define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF11]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -581,7 +581,7 @@ range_check_failed: ; preds = %guarded define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_04( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META1]] ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 128, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -589,7 +589,7 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i8, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp slt i8 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF11]] ; CHECK: guarded.us: ; 
CHECK-NEXT: [[EL_WIDE_US:%.*]] = zext i8 [[EL_US]] to i32 ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_WIDE_US]], [[X]] @@ -603,17 +603,17 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i8, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i8, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i8 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF11]] ; CHECK: guarded: ; CHECK-NEXT: [[EL_WIDE:%.*]] = zext i8 [[EL]] to i32 ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL_WIDE]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF3]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR]], i32 [[EL_WIDE]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -651,17 +651,19 @@ range_check_failed: ; preds = %guarded ret i32 -2 } ;. -; CHECK: [[META0:![0-9]+]] = !{} -; CHECK: [[PROF1]] = !{!"branch_weights", i32 100, i32 1} -; CHECK: [[LOOP2]] = distinct !{!2, !3} -; CHECK: [[META3:![0-9]+]] = !{!"llvm.loop.unswitch.injection.disable"} -; CHECK: [[LOOP4]] = distinct !{!4, !3} -; CHECK: [[PROF5]] = !{!"branch_weights", i32 0, i32 0} -; CHECK: [[PROF6]] = !{!"branch_weights", i32 2, i32 3} -; CHECK: [[PROF7]] = !{!"branch_weights", i32 -1, i32 -1000} -; CHECK: [[LOOP8]] = distinct !{!8, !3} -; CHECK: [[LOOP9]] = distinct !{!9, !3} -; CHECK: [[PROF10]] = !{!"branch_weights", i32 1, i32 100} -; CHECK: [[LOOP11]] = distinct !{!11, !3} -; CHECK: [[LOOP12]] = distinct !{!12, !3} +; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10} +; CHECK: [[META1]] = !{} +; CHECK: [[PROF2]] = !{!"unknown", !"simple-loop-unswitch"} +; CHECK: [[PROF3]] = !{!"branch_weights", i32 100, i32 1} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]} +; CHECK: [[META5]] = !{!"llvm.loop.unswitch.injection.disable"} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META5]]} +; CHECK: [[PROF7]] = !{!"branch_weights", i32 0, i32 0} +; CHECK: [[PROF8]] = !{!"branch_weights", i32 2, i32 3} +; CHECK: [[PROF9]] = !{!"branch_weights", i32 -1, i32 -1000} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META5]]} +; CHECK: [[PROF11]] = !{!"branch_weights", i32 1, i32 100} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META5]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META5]]} ;. 
diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll index 1d8942079ffd8..87161707d9f69 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll @@ -1,14 +1,14 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals ; RUN: opt -passes='loop-mssa(simple-loop-unswitch<nontrivial>),verify<loops>' -S < %s | FileCheck %s declare void @clobber() -define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { +define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) !prof !0 { ; CHECK-LABEL: @partial_unswitch_true_successor( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[PTR:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 100 -; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] +; CHECK-NEXT: br i1 [[TMP1]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]], !prof [[PROF1:![0-9]+]] ; CHECK: entry.split.us: ; CHECK-NEXT: br label [[LOOP_HEADER_US:%.*]] ; CHECK: loop.header.us: @@ -19,7 +19,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch.us: ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] +; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !prof [[PROF2:![0-9]+]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -28,7 +28,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[PTR]], align 4 ; CHECK-NEXT: [[SC:%.*]] = icmp eq i32 [[LV]], 100 -; CHECK-NEXT: br i1 [[SC]], label [[NOCLOBBER:%.*]], label [[CLOBBER:%.*]] +; CHECK-NEXT: br i1 [[SC]], label [[NOCLOBBER:%.*]], label [[CLOBBER:%.*]], !prof [[PROF1]] ; CHECK: noclobber: ; CHECK-NEXT: br label [[LOOP_LATCH]] ; CHECK: clobber: @@ -37,7 +37,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !prof [[PROF2]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -50,7 +50,7 @@ loop.header: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] %lv = load i32, ptr %ptr %sc = icmp eq i32 %lv, 100 - br i1 %sc, label %noclobber, label %clobber + br i1 %sc, label %noclobber, label %clobber, !prof !1 noclobber: br label %loop.latch @@ -62,7 +62,7 @@ clobber: loop.latch: %c = icmp ult i32 %iv, %N %iv.next = add i32 %iv, 1 - br i1 %c, label %loop.header, label %exit + br i1 %c, label %loop.header, label %exit, !prof !2 exit: ret i32 10 @@ -102,7 +102,7 @@ define i32 @partial_unswitch_false_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop 
[[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -171,7 +171,7 @@ define i32 @partial_unswtich_gep_load_icmp(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -246,7 +246,7 @@ define i32 @partial_unswitch_reduction_phi(ptr %ptr, i32 %N) { ; CHECK-NEXT: [[RED_NEXT]] = phi i32 [ [[ADD_5]], [[CLOBBER]] ], [ [[ADD_10]], [[NOCLOBBER]] ] ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP_LATCH]] ] ; CHECK-NEXT: br label [[EXIT]] @@ -325,7 +325,7 @@ define i32 @partial_unswitch_true_successor_noclobber(ptr noalias %ptr.1, ptr no ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -637,7 +637,7 @@ define i32 @partial_unswitch_true_successor_preheader_insertion(ptr %ptr, i32 %N ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT_SPLIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT_SPLIT:%.*]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit.loopexit.split: ; CHECK-NEXT: br label [[EXIT_LOOPEXIT]] ; CHECK: exit.loopexit: @@ -713,7 +713,7 @@ define i32 @partial_unswitch_true_successor_insert_point(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -784,7 +784,7 @@ define i32 @partial_unswitch_true_successor_hoist_invariant(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1073,7 +1073,7 @@ define i32 @partial_unswitch_true_to_latch(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; 
CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1138,7 +1138,7 @@ define i32 @partial_unswitch_exiting_block_with_multiple_unswitch_candidates(i32 ; CHECK-NEXT: store i32 [[TMP1:%.*]], ptr [[PTR]], align 16 ; CHECK-NEXT: br label [[EXITING]] ; CHECK: exiting: -; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: [[RET_VAL:%.*]] = phi i32 [ 1, [[EXITING]] ] ; CHECK-NEXT: br label [[EXIT]] @@ -1249,7 +1249,7 @@ define i32 @partial_unswitch_true_successor_for_cost_calculation(ptr %ptr, i32 % ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1360,7 +1360,7 @@ define i32 @partial_unswitch_true_successor_trunc(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1425,7 +1425,7 @@ define i32 @partial_unswitch_false_successor_trunc(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1456,15 +1456,26 @@ exit: ret i32 10 } -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[UNSWITCH_PARTIAL_DISABLE:![0-9]+]]} -; CHECK: [[UNSWITCH_PARTIAL_DISABLE]] = !{!"llvm.loop.unswitch.partial.disable"} -; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[UNSWITCH_PARTIAL_DISABLE]]} +!0 = !{!"function_entry_count", i32 10} +!1 = !{!"branch_weights", i32 1000, i32 1} +!2 = !{!"branch_weights", i32 100, i32 3} + +;. 
+; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 1000, i32 1} +; CHECK: [[PROF2]] = !{!"branch_weights", i32 100, i32 3} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]} +; CHECK: [[META4]] = !{!"llvm.loop.unswitch.partial.disable"} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META4]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META4]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META4]]} +; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META4]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]]} +; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META4]]} +; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META4]]} +; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META4]]} +; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META4]]} +; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META4]]} +; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META4]]} +;. diff --git a/llvm/test/Transforms/SimplifyCFG/pr166369.ll b/llvm/test/Transforms/SimplifyCFG/pr166369.ll new file mode 100644 index 0000000000000..c0a85c0293dd8 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/pr166369.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=simplifycfg < %s | FileCheck %s + +; Make sure we handle full-set ranges correctly. +define void @test_i1() { +; CHECK-LABEL: define void @test_i1() { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: ret void +; +bb: + %icmp = icmp ugt i1 false, true + br label %bb5 + +bb5: + %select = select i1 %icmp, i1 %icmp, i1 false + br i1 %select, label %bb5, label %bb6 + +bb6: + ret void +} + +define void @test_i3() { +; CHECK-LABEL: define void @test_i3() { +; CHECK-NEXT: [[BB:.*:]] +; CHECK-NEXT: ret void +; +bb: + %icmp = icmp ugt i3 0, 7 + br label %bb5 + +bb5: + %select = select i1 %icmp, i1 %icmp, i1 false + br i1 %select, label %bb5, label %bb6 + +bb6: + ret void +} diff --git a/llvm/test/Transforms/SimplifyCFG/rangereduce.ll b/llvm/test/Transforms/SimplifyCFG/rangereduce.ll index d1fba91d1e505..169803f7aa012 100644 --- a/llvm/test/Transforms/SimplifyCFG/rangereduce.ll +++ b/llvm/test/Transforms/SimplifyCFG/rangereduce.ll @@ -321,7 +321,7 @@ three: !1 = !{!"branch_weights", i32 5, i32 7, i32 11, i32 13, i32 17} ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { optsize } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ;. 
; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 100} ; CHECK: [[PROF1]] = !{!"branch_weights", i32 48, i32 5} diff --git a/llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll b/llvm/test/Transforms/SimplifyCFG/switch-on-const.ll similarity index 54% rename from llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll rename to llvm/test/Transforms/SimplifyCFG/switch-on-const.ll index e8b58639c13dd..1ab1b5e8bd838 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch-on-const-select.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch-on-const.ll @@ -154,6 +154,132 @@ bees: unreachable } +define void @pr165179(i1 %cond) { +; CHECK-LABEL: @pr165179( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB:%.*]] +; CHECK: if.else: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB]] +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: ret void +; +entry: + br i1 %cond, label %if.then, label %if.else + +if.then: + tail call void @bees.a() nounwind + br label %switchbb + +if.else: + tail call void @bees.b() nounwind + br label %switchbb + +switchbb: + %cond1 = phi i32 [ 1, %if.else ], [ -1, %if.then ] + switch i32 %cond1, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + +define void @switch_remove_dead_case_phi(i1 %cond1, i1 %cond2) { +; CHECK-LABEL: @switch_remove_dead_case_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND1:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br i1 [[COND2:%.*]], label [[SWITCHBB:%.*]], label [[IF_ELSE]] +; CHECK: if.else: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ 3, [[ENTRY:%.*]] ], [ -1, [[IF_THEN]] ] +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[SWITCHBB]] +; CHECK: switchbb: +; CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[PHI]], [[IF_ELSE]] ], [ 5, [[IF_THEN]] ] +; CHECK-NEXT: [[COND3:%.*]] = icmp eq i32 [[COND]], -1 +; CHECK-NEXT: br i1 [[COND3]], label [[EXIT:%.*]], label [[DEFAULT:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: default: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET]] +; +entry: + br i1 %cond1, label %if.then, label %if.else + +if.then: + tail call void @bees.a() nounwind + br i1 %cond2, label %switchbb, label %if.else + +if.else: + %phi = phi i32 [ 3, %entry ], [ -1, %if.then ] + tail call void @bees.b() nounwind + br label %switchbb + +switchbb: + %cond = phi i32 [ %phi, %if.else ], [ 5, %if.then ] + switch i32 %cond, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + +define void @switch_remove_dead_case_select(i1 %cond1, i1 %cond2) { +; CHECK-LABEL: @switch_remove_dead_case_select( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = select i1 [[COND1:%.*]], i32 -1, i32 3 +; CHECK-NEXT: [[Y:%.*]] = select i1 [[COND2:%.*]], i32 [[X]], i32 5 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[Y]], -1 +; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[DEFAULT:%.*]] +; CHECK: 
common.ret: +; CHECK-NEXT: ret void +; CHECK: exit: +; CHECK-NEXT: tail call void @bees.a() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: default: +; CHECK-NEXT: tail call void @bees.b() #[[ATTR0]] +; CHECK-NEXT: br label [[COMMON_RET]] +; +entry: + %x = select i1 %cond1, i32 -1, i32 3 + %y = select i1 %cond2, i32 %x, i32 5 + switch i32 %y, label %default [ + i32 1, label %exit + i32 -1, label %exit + ] + +exit: + tail call void @bees.a() nounwind + ret void + +default: + tail call void @bees.b() nounwind + ret void +} + declare void @llvm.trap() nounwind noreturn declare void @bees.a() nounwind declare void @bees.b() nounwind diff --git a/llvm/test/Transforms/SimplifyCFG/switch-umin.ll b/llvm/test/Transforms/SimplifyCFG/switch-umin.ll new file mode 100644 index 0000000000000..44665365dc222 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/switch-umin.ll @@ -0,0 +1,246 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes=simplifycfg < %s | FileCheck %s + +declare void @a() +declare void @b() +declare void @c() +declare void @d() + +define void @switch_replace_default(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 0, label %[[CASE0:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ], !prof [[PROF0:![0-9]+]] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ], !prof !0 + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @switch_replace_default_and_remove_dead_cases(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default_and_remove_dead_cases( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 4, label %case4 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case4: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @switch_replace_default_when_holes(i32 %x) { +; CHECK-LABEL: define void @switch_replace_default_when_holes( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[X]], label %[[COMMON_RET:.*]] [ +; CHECK-NEXT: i32 1, 
label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %unreachable [ + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @do_not_switch_replace_default(i32 %x, i32 %y) { +; CHECK-LABEL: define void @do_not_switch_replace_default( +; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 [[Y]]) +; CHECK-NEXT: switch i32 [[MIN]], label %[[UNREACHABLE:.*]] [ +; CHECK-NEXT: i32 0, label %[[CASE0:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: i32 3, label %[[COMMON_RET:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[UNREACHABLE]]: +; CHECK-NEXT: unreachable +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 %y) + switch i32 %min, label %unreachable [ + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + ] + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +unreachable: + unreachable +} + +define void @do_not_replace_switch_default_but_remove_dead_cases(i32 %x) { +; CHECK-LABEL: define void @do_not_replace_switch_default_but_remove_dead_cases( +; CHECK-SAME: i32 [[X:%.*]]) { +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[X]], i32 3) +; CHECK-NEXT: switch i32 [[MIN]], label %[[CASE0:.*]] [ +; CHECK-NEXT: i32 3, label %[[COMMON_RET:.*]] +; CHECK-NEXT: i32 1, label %[[CASE1:.*]] +; CHECK-NEXT: i32 2, label %[[CASE2:.*]] +; CHECK-NEXT: ] +; CHECK: [[COMMON_RET]]: +; CHECK-NEXT: ret void +; CHECK: [[CASE0]]: +; CHECK-NEXT: call void @a() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE1]]: +; CHECK-NEXT: call void @b() +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[CASE2]]: +; CHECK-NEXT: call void @c() +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %min = call i32 @llvm.umin.i32(i32 %x, i32 3) + switch i32 %min, label %case0 [ ; default is reachable, therefore simplification not triggered + i32 0, label %case0 + i32 1, label %case1 + i32 2, label %case2 + i32 3, label %case3 + i32 4, label %case4 + ] + +case0: + call void @a() + ret void + +case1: + call void @b() + ret void + +case2: + call void @c() + ret void + +case3: + ret void + +case4: + call void @d() + ret void + +} + + +!0 = !{!"branch_weights", i32 1, i32 2, i32 3, i32 99, i32 5} +;. +; CHECK: [[PROF0]] = !{!"branch_weights", i32 5, i32 2, i32 3, i32 99} +;. 
diff --git a/llvm/test/Transforms/SimplifyCFG/switch_mask.ll b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll index f8bcbc057a7ae..428c18fc18e3d 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_mask.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll @@ -221,6 +221,7 @@ define i1 @pr88607() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COND:%.*]] = select i1 false, i32 4, i32 1 ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 false, i32 2, i32 [[COND]] +; CHECK-NEXT: [[COND1:%.*]] = icmp eq i32 [[SPEC_SELECT]], 1 ; CHECK-NEXT: ret i1 false ; entry: diff --git a/llvm/test/Transforms/SimplifyCFG/switch_undef.ll b/llvm/test/Transforms/SimplifyCFG/switch_undef.ll index 88a729b7d941a..4de5ea948ed27 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_undef.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_undef.ll @@ -5,12 +5,11 @@ define void @f6() #0 { ; CHECK-LABEL: @f6( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_COND_I:%.*]] -; CHECK: for.cond.i: +; CHECK-NEXT: br label [[F1_EXIT_I:%.*]] +; CHECK: f1.exit.i: ; CHECK-NEXT: [[TOBOOL7_I:%.*]] = icmp ne i16 1, 0 -; CHECK-NEXT: br label [[FOR_COND_I]] +; CHECK-NEXT: br label [[F1_EXIT_I]] ; - entry: br label %for.cond.i diff --git a/llvm/test/Transforms/StructurizeCFG/callbr.ll b/llvm/test/Transforms/StructurizeCFG/callbr.ll new file mode 100644 index 0000000000000..42f95194980d4 --- /dev/null +++ b/llvm/test/Transforms/StructurizeCFG/callbr.ll @@ -0,0 +1,235 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=structurizecfg %s -o - | FileCheck %s + +; Structurize as usual, but don't tear callbr and its destination blocks apart. +; +; Note: currently, callbr blocks and their corresponding target blocks +; themselves are not handled by the structurizer.* If the CFG turns out to be +; unstructured at the end, the CFG lowering (si-annotate-control-flow) will +; detect this. For the currently intended use cases of callbr in the context of +; the AMDGPU backend, this is not a limitation (cf. +; https://discourse.llvm.org/t/rfc-add-callbr-intrinsic-support/86087). +; +; Note 2: while callbr and its targets remain untouched, everything else is +; handled as usual, even if it is nested in a callbr region. +; +; *FIXME: this will be fixed in the future. Callbr can be handled as follows: +; Input IR: +; ``` +; define void @foo_callbr() { +; callbr void asm "", "!i"() to label %fallthrough [label %indirect, ...] +; fallthrough: +; br label %exit +; indirect: +; br label %exit +; ... +; exit: +; ret void +; } +; ``` +; +; Output IR: +; ``` +; define void @foo_callbr() { +; callbr void asm "", "!i"() +; to label %fallthrough [label %fake.indirect, label %fake.indirect1, label %fake.indirect2, ...] +; fake.indirect: ; preds = %0 +; br label %Flow +; fake.indirect1: ; preds = %0 +; br label %Flow +; fake.indirect2: ; preds = %0 +; br label %Flow +; ... +; Flow: ; preds = %fallthrough, %fake.indirect[0-N] +; %1 = phi i1 [ false, %fallthrough ], [ true, %fake.indirect ], [ false, %fake.indirect[1-N] ] +; br i1 %1, label %indirect, label %Flow1 +; Flow1: ; preds = %Flow, %indirect +; %2 = phi i1 [ false, %Flow], [ true, %fake.indirect1 ], [ false, %indirect ] +; br i1 %2, label %indirect1, label %Flow2 +; Flow2: ; preds = %Flow, %indirect1 +; %2 = phi i1 [ false, %Flow], [ true, %fake.indirect2 ], [ false, %indirect1 ] +; br i1 %2, label %indirect2, label %Flow3 +; ... 
+; fallthrough: ; preds = %0 +; br label %Flow +; indirect: ; preds = %Flow +; br label %Flow1 +; indirect1: ; preds = %Flow1 +; br label %Flow2 +; indirect2: : preds = %Flow2 +; br label %Flow3 +; ... +; exit: ; preds = %indirectN, %FlowN +; ret void +; } +; ``` +; +; Output IR as ASCII-art: +; %0 +; --------------------- +; | | | | +; v v v v +; f f.i f.i1 f.i2 +; | | | | +; v v v v +; --------------------- +; %Flow +; | \ +; | %indirect +; | / +; %Flow1 +; | \ +; | %indirect1 +; | / +; %Flow2 +; | \ +; | %indirect2 +; | / +; %exit +; + +; Only callbr, nothing to do. +define void @callbr_simple() { +; CHECK-LABEL: define void @callbr_simple() { +; CHECK-NEXT: [[CALLBR:.*:]] +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br label %exit +indirect: + br label %exit +exit: + ret void +} + +; Callbr nested in non-callbr: non-callbr is transformed +define void @callbr_in_non_callbr(i1 %c) { +; CHECK-LABEL: define void @callbr_in_non_callbr( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW:.*]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[NOCALLBR]] ], [ true, [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[EXIT:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + br i1 %c, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br label %exit +indirect: + br label %exit +nocallbr: + br label %exit +exit: + ret void +} + +; Callbr parent of non-callbr: non-callbr is transformed +define void @non_callbr_in_callbr(i1 %c) { +; CHECK-LABEL: define void @non_callbr_in_callbr( +; CHECK-SAME: i1 [[C:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br i1 [[C_INV]], label %[[FALLTHROUGH2:.*]], label %[[FLOW:.*]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FALLTHROUGH2]] ], [ true, %[[INDIRECT]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[FALLTHROUGH1:.*]], label %[[FLOW1:.*]] +; CHECK: [[FALLTHROUGH1]]: +; CHECK-NEXT: br label %[[FLOW1]] +; CHECK: [[FALLTHROUGH2]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[INDIRECT1:.*:]] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[FLOW1]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %c, label %fallthrough1, label %fallthrough2 +fallthrough1: + br label %exit +fallthrough2: + br label %exit +indirect: + br label %exit +exit: + ret void +} + +; Callbr surrounded by non-callbr: all three regular branches are handled +; correctly +define void @callbr_nested_in_non_callbr(i1 %c, i1 %d, i1 %e, i1 %f) { +; CHECK-LABEL: define void 
@callbr_nested_in_non_callbr( +; CHECK-SAME: i1 [[C:%.*]], i1 [[D:%.*]], i1 [[E:%.*]], i1 [[F:%.*]]) { +; CHECK-NEXT: [[C_INV:%.*]] = xor i1 [[C]], true +; CHECK-NEXT: br i1 [[C_INV]], label %[[NOCALLBR:.*]], label %[[FLOW3:.*]] +; CHECK: [[FLOW3]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi i1 [ false, %[[FLOW:.*]] ], [ true, [[TMP0:%.*]] ] +; CHECK-NEXT: br i1 [[TMP1]], label %[[CALLBR:.*]], label %[[RET:.*]] +; CHECK: [[CALLBR]]: +; CHECK-NEXT: callbr void asm "", "!i"() +; CHECK-NEXT: to label %[[INDIRECT:.*]] [label %indirect] +; CHECK: [[INDIRECT]]: +; CHECK-NEXT: br i1 [[D]], label %[[FALLTHROUGH1:.*]], label %[[FLOW2:.*]] +; CHECK: [[FALLTHROUGH1]]: +; CHECK-NEXT: br label %[[FLOW2]] +; CHECK: [[INDIRECT2:.*:]] +; CHECK-NEXT: br i1 [[E]], label %[[INDIRECT1:.*]], label %[[FLOW1:.*]] +; CHECK: [[INDIRECT1]]: +; CHECK-NEXT: br label %[[FLOW1]] +; CHECK: [[NOCALLBR]]: +; CHECK-NEXT: br i1 [[F]], label %[[NOCALLBR1:.*]], label %[[FLOW]] +; CHECK: [[NOCALLBR1]]: +; CHECK-NEXT: br label %[[FLOW]] +; CHECK: [[FLOW]]: +; CHECK-NEXT: br label %[[FLOW3]] +; CHECK: [[FLOW1]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[FLOW2]]: +; CHECK-NEXT: br label %[[RET]] +; CHECK: [[RET]]: +; CHECK-NEXT: ret void +; + br i1 %c, label %callbr, label %nocallbr +callbr: + callbr void asm "", "!i"() to label %fallthrough [label %indirect] +fallthrough: + br i1 %d, label %fallthrough1, label %ret +fallthrough1: + br label %ret +indirect: + br i1 %e, label %indirect1, label %ret +indirect1: + br label %ret +nocallbr: + br i1 %f, label %nocallbr1, label %ret +nocallbr1: + br label %ret +ret: + ret void +} diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll index ee3a0539bf300..c005316f07f06 100644 --- a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/basic.ll @@ -11,5 +11,9 @@ define float @sinf(float %x) { } ; CHECK: declare void @acosf(...) + +; CHECK: declare nofpclass(ninf nsub nnorm) float @sqrtf(float) [[SQRT_ATTRS:#[0-9]+]] +; CHECK: declare nofpclass(ninf nsub nnorm) double @sqrt(double) [[SQRT_ATTRS:#[0-9]+]] + ; CHECK: declare void @__umodti3(...) 
diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll new file mode 100644 index 0000000000000..ffbf11d4106dc --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/merge_attributes.ll @@ -0,0 +1,11 @@ +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define noundef nofpclass(nan) float @sqrtf(float %x) "foo" { + %ret = call float asm "; $0 = sqrt($1)", "=r,r"(float %x) + ret float %ret +} + +; FIXME: Individual fields of nofpclass not merged +; CHECK: define noundef nofpclass(ninf nsub nnorm) float @sqrtf(float %x) [[SQRT_ATTR:#[0-9]+]] { + +; CHECK: attributes [[SQRT_ATTR]] = { nocallback nofree nosync nounwind willreturn memory(errnomem: write) "foo" } diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll new file mode 100644 index 0000000000000..f0f09e97d9dba --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/sincos_stret.ll @@ -0,0 +1,23 @@ +; RUN: %if x86-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,X64 %s %} +; RUN: %if aarch64-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=arm64-apple-macos10.9 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s %} +; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefixes=CHECK,STRUCT %s %} +; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s %} +; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=thumbv7-apple-ios7 < %s | FileCheck -check-prefix=SRET %s %} + +; RUN: %if arm-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=armv7-apple-ios6 < %s | FileCheck -check-prefix=NONE %s %} +; RUN: %if x86-registered-target %{ opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.8 < %s | FileCheck -check-prefix=NONE %s %} + +; X64: declare <2 x float> @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]] +; X64: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]] + +; STRUCT: declare { float, float } @__sincosf_stret(float) [[SINCOS_ATTRS:#[0-9]+]] +; STRUCT: declare { double, double } @__sincos_stret(double) [[SINCOS_ATTRS:#[0-9]+]] + +; SRET: declare void @__sincosf_stret(ptr sret({ float, float }) align 4, float) [[SINCOS_ATTRS:#[0-9]+]] +; SRET: declare void @__sincos_stret(ptr sret({ double, double }) align 4, double) [[SINCOS_ATTRS:#[0-9]+]] + +; CHECK: attributes [[SINCOS_ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(errnomem: write) } +; SRET: attributes [[SINCOS_ATTRS]] = { nocallback nofree nosync nounwind willreturn memory(argmem: write, errnomem: write) } + +; NONE-NOT: __sincos_stret +; NONE-NOT: __sincosf_stret diff --git a/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll new file mode 100644 index 0000000000000..2451010df5b75 --- /dev/null +++ b/llvm/test/Transforms/Util/DeclareRuntimeLibcalls/wrong_declaration.ll @@ -0,0 +1,21 @@ +; RUN: opt -S -passes=declare-runtime-libcalls -mtriple=x86_64-apple-macos10.9 < %s | FileCheck %s + +; Make sure there is no crash if there are definitions or 
declarations +; with the wrong type signature. + +; CHECK: define void @sqrtf() { +define void @sqrtf() { + ret void +} + +; CHECK: define float @sqrt(float %0) { +define float @sqrt(float) { + ret float 0.0 +} + +; CHECK: declare double @__sincos_stret(double) +declare double @__sincos_stret(double) + +; CHECK: declare { float, float } @__sincosf_stret(float) +declare { float, float } @__sincosf_stret(float) + diff --git a/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll b/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll index 60700412686ea..e7b11cdf8475e 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/ext-extract.ll @@ -346,3 +346,189 @@ entry: call void @use.i32(i32 %ext.3) ret void } + +define noundef i32 @zext_v4i8_all_lanes_used_no_freeze(<4 x i8> %src) { +; CHECK-LABEL: define noundef i32 @zext_v4i8_all_lanes_used_no_freeze( +; CHECK-SAME: <4 x i8> [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i8> [[SRC]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 24 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP0]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = and i32 [[TMP2]], 255 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP0]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 255 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2 +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP3]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP1]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.2 = extractelement <4 x i32> %ext, i64 2 + %ext.3 = extractelement <4 x i32> %ext, i64 3 + + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.2 + %add3 = add i32 %add2, %ext.3 + ret i32 %add3 +} + +define noundef i32 @zext_v4i8_not_all_lanes_used(<4 x i8> %src) { +; CHECK-LABEL: define noundef i32 @zext_v4i8_not_all_lanes_used( +; CHECK-SAME: <4 x i8> [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP2:%.*]] = freeze <4 x i8> [[SRC]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i8> [[TMP2]] to i32 +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 24 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP0]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 255 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP0]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP1]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.3 = extractelement <4 x i32> %ext, i64 3 + + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.3 + ret i32 %add2 +} + +define i32 
@zext_v4i8_all_lanes_used_no_ub(<4 x i8> %src) { +; CHECK-LABEL: define i32 @zext_v4i8_all_lanes_used_no_ub( +; CHECK-SAME: <4 x i8> [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 255 +; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2 +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP4]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP2]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.2 = extractelement <4 x i32> %ext, i64 2 + %ext.3 = extractelement <4 x i32> %ext, i64 3 + + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.2 + %add3 = add i32 %add2, %ext.3 + ret i32 %add3 +} + +define noundef i32 @zext_v4i8_extracts_different_blocks(<4 x i8> %src, i1 %cond) { +; CHECK-LABEL: define noundef i32 @zext_v4i8_extracts_different_blocks( +; CHECK-SAME: <4 x i8> [[SRC:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 255 +; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2 +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[TMP4]], %[[THEN]] ], [ [[TMP2]], %[[ELSE]] ] +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[PHI]] +; CHECK-NEXT: ret i32 [[ADD2]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + br i1 %cond, label %then, label %else + +then: + %ext.2 = extractelement <4 x i32> %ext, i64 2 + br label %exit + +else: + %ext.3 = extractelement <4 x i32> %ext, i64 3 + br label %exit + +exit: + %phi = phi i32 [ %ext.2, %then ], [ %ext.3, %else ] + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %phi + ret i32 %add2 +} + + +declare void @may_throw() willreturn + +define noundef i32 
@zext_v4i8_throwing_call_between(<4 x i8> %src) { +; CHECK-LABEL: define noundef i32 @zext_v4i8_throwing_call_between( +; CHECK-SAME: <4 x i8> [[SRC:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = freeze <4 x i8> [[SRC]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i8> [[TMP0]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 24 +; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP1]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], 255 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP1]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 255 +; CHECK-NEXT: [[TMP7:%.*]] = and i32 [[TMP1]], 255 +; CHECK-NEXT: [[EXT:%.*]] = zext nneg <4 x i8> [[SRC]] to <4 x i32> +; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <4 x i32> [[EXT]], i64 0 +; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <4 x i32> [[EXT]], i64 1 +; CHECK-NEXT: [[EXT_2:%.*]] = extractelement <4 x i32> [[EXT]], i64 2 +; CHECK-NEXT: call void @may_throw() +; CHECK-NEXT: [[EXT_3:%.*]] = extractelement <4 x i32> [[EXT]], i64 3 +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[TMP7]], [[TMP6]] +; CHECK-NEXT: [[ADD2:%.*]] = add i32 [[ADD1]], [[TMP4]] +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD2]], [[TMP2]] +; CHECK-NEXT: ret i32 [[ADD3]] +; +entry: + %ext = zext nneg <4 x i8> %src to <4 x i32> + %ext.0 = extractelement <4 x i32> %ext, i64 0 + %ext.1 = extractelement <4 x i32> %ext, i64 1 + %ext.2 = extractelement <4 x i32> %ext, i64 2 + call void @may_throw() + %ext.3 = extractelement <4 x i32> %ext, i64 3 + %add1 = add i32 %ext.0, %ext.1 + %add2 = add i32 %add1, %ext.2 + %add3 = add i32 %add2, %ext.3 + ret i32 %add3 +} diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll new file mode 100644 index 0000000000000..4b551fad5b43a --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/AMDGPU/extract-insert-chain-to-shuffles.ll @@ -0,0 +1,567 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1100 -passes=vector-combine < %s | FileCheck -check-prefix=OPT %s + +; Generated from amdgpu-promote-alloca on array of vectors +; VectorCombiner should recognize chain of extract-insert vectors +; and turn them into one or two shuffles +define amdgpu_kernel void @extract_insert_chain_to_shuffles(<16 x i8> %in, <16 x i8> %add, ptr addrspace(3) %out) #0 { +; OPT-LABEL: define amdgpu_kernel void @extract_insert_chain_to_shuffles( +; OPT-SAME: <16 x i8> [[IN:%.*]], <16 x i8> [[ADD:%.*]], ptr addrspace(3) [[OUT:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: [[ALLOCA:%.*]] = freeze <128 x i8> poison +; OPT-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP1:%.*]] = shufflevector <128 x i8> [[ALLOCA]], <128 x i8> [[TMP0]], <128 x i32> <i32 128, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP3:%.*]] = insertelement <128 x i8> [[TMP1]], i8 [[TMP2]], i32 1 +; OPT-NEXT: [[TMP4:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP5:%.*]] = insertelement <128 x i8> [[TMP3]], i8 [[TMP4]], i32 2 +; OPT-NEXT: [[TMP6:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP7:%.*]] = insertelement <128 x i8> [[TMP5]], i8 [[TMP6]], i32 3 +; OPT-NEXT: [[TMP8:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP9:%.*]] = insertelement <128 x i8> [[TMP7]], i8 [[TMP8]], i32 4 +; OPT-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP11:%.*]] = insertelement <128 x i8> [[TMP9]], i8 [[TMP10]], i32 5 +; OPT-NEXT: [[TMP12:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP13:%.*]] = insertelement <128 x i8> [[TMP11]], i8 [[TMP12]], i32 6 +; OPT-NEXT: [[TMP14:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP15:%.*]] = insertelement <128 x i8> [[TMP13]], i8 [[TMP14]], i32 7 +; OPT-NEXT: [[TMP16:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP17:%.*]] = insertelement <128 x i8> [[TMP15]], i8 [[TMP16]], i32 8 +; OPT-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP19:%.*]] = insertelement <128 x i8> [[TMP17]], i8 [[TMP18]], i32 9 +; OPT-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP21:%.*]] = insertelement <128 x i8> [[TMP19]], i8 [[TMP20]], i32 10 +; OPT-NEXT: [[TMP22:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP23:%.*]] = insertelement <128 x 
i8> [[TMP21]], i8 [[TMP22]], i32 11 +; OPT-NEXT: [[TMP24:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP25:%.*]] = insertelement <128 x i8> [[TMP23]], i8 [[TMP24]], i32 12 +; OPT-NEXT: [[TMP26:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP27:%.*]] = insertelement <128 x i8> [[TMP25]], i8 [[TMP26]], i32 13 +; OPT-NEXT: [[TMP28:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP29:%.*]] = insertelement <128 x i8> [[TMP27]], i8 [[TMP28]], i32 14 +; OPT-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP31:%.*]] = insertelement <128 x i8> [[TMP29]], i8 [[TMP30]], i32 15 +; OPT-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP33:%.*]] = shufflevector <128 x i8> [[TMP31]], <128 x i8> [[TMP32]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 128, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP34:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP35:%.*]] = insertelement <128 x i8> [[TMP33]], 
i8 [[TMP34]], i32 17 +; OPT-NEXT: [[TMP36:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP37:%.*]] = insertelement <128 x i8> [[TMP35]], i8 [[TMP36]], i32 18 +; OPT-NEXT: [[TMP38:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP39:%.*]] = insertelement <128 x i8> [[TMP37]], i8 [[TMP38]], i32 19 +; OPT-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP41:%.*]] = insertelement <128 x i8> [[TMP39]], i8 [[TMP40]], i32 20 +; OPT-NEXT: [[TMP42:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP43:%.*]] = insertelement <128 x i8> [[TMP41]], i8 [[TMP42]], i32 21 +; OPT-NEXT: [[TMP44:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP45:%.*]] = insertelement <128 x i8> [[TMP43]], i8 [[TMP44]], i32 22 +; OPT-NEXT: [[TMP46:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP47:%.*]] = insertelement <128 x i8> [[TMP45]], i8 [[TMP46]], i32 23 +; OPT-NEXT: [[TMP48:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP49:%.*]] = insertelement <128 x i8> [[TMP47]], i8 [[TMP48]], i32 24 +; OPT-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP51:%.*]] = insertelement <128 x i8> [[TMP49]], i8 [[TMP50]], i32 25 +; OPT-NEXT: [[TMP52:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP53:%.*]] = insertelement <128 x i8> [[TMP51]], i8 [[TMP52]], i32 26 +; OPT-NEXT: [[TMP54:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP55:%.*]] = insertelement <128 x i8> [[TMP53]], i8 [[TMP54]], i32 27 +; OPT-NEXT: [[TMP56:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP57:%.*]] = insertelement <128 x i8> [[TMP55]], i8 [[TMP56]], i32 28 +; OPT-NEXT: [[TMP58:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP59:%.*]] = insertelement <128 x i8> [[TMP57]], i8 [[TMP58]], i32 29 +; OPT-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP61:%.*]] = insertelement <128 x i8> [[TMP59]], i8 [[TMP60]], i32 30 +; OPT-NEXT: [[TMP62:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP63:%.*]] = insertelement <128 x i8> [[TMP61]], i8 [[TMP62]], i32 31 +; OPT-NEXT: [[TMP64:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, 
i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP65:%.*]] = shufflevector <128 x i8> [[TMP63]], <128 x i8> [[TMP64]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 128, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP66:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP67:%.*]] = insertelement <128 x i8> [[TMP65]], i8 [[TMP66]], i32 33 +; OPT-NEXT: [[TMP68:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP69:%.*]] = insertelement <128 x i8> [[TMP67]], i8 [[TMP68]], i32 34 +; OPT-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP71:%.*]] = insertelement <128 x i8> [[TMP69]], i8 [[TMP70]], i32 35 +; OPT-NEXT: [[TMP72:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP73:%.*]] = insertelement <128 x i8> [[TMP71]], i8 [[TMP72]], i32 36 +; OPT-NEXT: [[TMP74:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP75:%.*]] = insertelement <128 x i8> [[TMP73]], i8 [[TMP74]], i32 37 +; OPT-NEXT: [[TMP76:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP77:%.*]] = insertelement <128 x i8> [[TMP75]], i8 [[TMP76]], i32 38 +; OPT-NEXT: [[TMP78:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP79:%.*]] = insertelement <128 x i8> [[TMP77]], i8 [[TMP78]], i32 39 +; OPT-NEXT: [[TMP80:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP81:%.*]] = insertelement <128 x i8> [[TMP79]], i8 [[TMP80]], i32 40 +; OPT-NEXT: [[TMP82:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP83:%.*]] = insertelement <128 x i8> [[TMP81]], i8 [[TMP82]], i32 41 +; OPT-NEXT: [[TMP84:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP85:%.*]] = insertelement <128 x i8> [[TMP83]], i8 [[TMP84]], i32 42 +; OPT-NEXT: [[TMP86:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP87:%.*]] = insertelement <128 x i8> [[TMP85]], i8 [[TMP86]], i32 43 +; OPT-NEXT: [[TMP88:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP89:%.*]] = insertelement <128 x i8> [[TMP87]], i8 [[TMP88]], i32 44 +; OPT-NEXT: [[TMP90:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP91:%.*]] = insertelement <128 x i8> [[TMP89]], i8 [[TMP90]], i32 45 +; OPT-NEXT: [[TMP92:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP93:%.*]] = insertelement <128 x i8> [[TMP91]], i8 [[TMP92]], i32 
46 +; OPT-NEXT: [[TMP94:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP95:%.*]] = insertelement <128 x i8> [[TMP93]], i8 [[TMP94]], i32 47 +; OPT-NEXT: [[TMP96:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP97:%.*]] = shufflevector <128 x i8> [[TMP95]], <128 x i8> [[TMP96]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 128, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP98:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP99:%.*]] = insertelement <128 x i8> [[TMP97]], i8 [[TMP98]], i32 49 +; OPT-NEXT: [[TMP100:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP101:%.*]] = insertelement <128 x i8> [[TMP99]], i8 [[TMP100]], i32 50 +; OPT-NEXT: [[TMP102:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP103:%.*]] = insertelement <128 x i8> [[TMP101]], i8 [[TMP102]], i32 51 +; OPT-NEXT: [[TMP104:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP105:%.*]] = insertelement <128 x i8> [[TMP103]], i8 [[TMP104]], i32 52 +; 
OPT-NEXT: [[TMP106:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP107:%.*]] = insertelement <128 x i8> [[TMP105]], i8 [[TMP106]], i32 53 +; OPT-NEXT: [[TMP108:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP109:%.*]] = insertelement <128 x i8> [[TMP107]], i8 [[TMP108]], i32 54 +; OPT-NEXT: [[TMP110:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP111:%.*]] = insertelement <128 x i8> [[TMP109]], i8 [[TMP110]], i32 55 +; OPT-NEXT: [[TMP112:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP113:%.*]] = insertelement <128 x i8> [[TMP111]], i8 [[TMP112]], i32 56 +; OPT-NEXT: [[TMP114:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP115:%.*]] = insertelement <128 x i8> [[TMP113]], i8 [[TMP114]], i32 57 +; OPT-NEXT: [[TMP116:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP117:%.*]] = insertelement <128 x i8> [[TMP115]], i8 [[TMP116]], i32 58 +; OPT-NEXT: [[TMP118:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP119:%.*]] = insertelement <128 x i8> [[TMP117]], i8 [[TMP118]], i32 59 +; OPT-NEXT: [[TMP120:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP121:%.*]] = insertelement <128 x i8> [[TMP119]], i8 [[TMP120]], i32 60 +; OPT-NEXT: [[TMP122:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP123:%.*]] = insertelement <128 x i8> [[TMP121]], i8 [[TMP122]], i32 61 +; OPT-NEXT: [[TMP124:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP125:%.*]] = insertelement <128 x i8> [[TMP123]], i8 [[TMP124]], i32 62 +; OPT-NEXT: [[TMP126:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP127:%.*]] = insertelement <128 x i8> [[TMP125]], i8 [[TMP126]], i32 63 +; OPT-NEXT: [[TMP128:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP129:%.*]] = shufflevector <128 x i8> [[TMP127]], <128 x i8> [[TMP128]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, 
i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 128, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP130:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP131:%.*]] = insertelement <128 x i8> [[TMP129]], i8 [[TMP130]], i32 65 +; OPT-NEXT: [[TMP132:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP133:%.*]] = insertelement <128 x i8> [[TMP131]], i8 [[TMP132]], i32 66 +; OPT-NEXT: [[TMP134:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP135:%.*]] = insertelement <128 x i8> [[TMP133]], i8 [[TMP134]], i32 67 +; OPT-NEXT: [[TMP136:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP137:%.*]] = insertelement <128 x i8> [[TMP135]], i8 [[TMP136]], i32 68 +; OPT-NEXT: [[TMP138:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP139:%.*]] = insertelement <128 x i8> [[TMP137]], i8 [[TMP138]], i32 69 +; OPT-NEXT: [[TMP140:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP141:%.*]] = insertelement <128 x i8> [[TMP139]], i8 [[TMP140]], i32 70 +; OPT-NEXT: [[TMP142:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP143:%.*]] = insertelement <128 x i8> [[TMP141]], i8 [[TMP142]], i32 71 +; OPT-NEXT: [[TMP144:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP145:%.*]] = insertelement <128 x i8> [[TMP143]], i8 [[TMP144]], i32 72 +; OPT-NEXT: [[TMP146:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP147:%.*]] = insertelement <128 x i8> [[TMP145]], i8 [[TMP146]], i32 73 +; OPT-NEXT: [[TMP148:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP149:%.*]] = insertelement <128 x i8> [[TMP147]], i8 [[TMP148]], i32 74 +; OPT-NEXT: [[TMP150:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP151:%.*]] = insertelement <128 x i8> [[TMP149]], i8 [[TMP150]], i32 75 +; OPT-NEXT: [[TMP152:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP153:%.*]] = insertelement <128 x i8> [[TMP151]], i8 [[TMP152]], i32 76 +; OPT-NEXT: [[TMP154:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP155:%.*]] = insertelement <128 x i8> [[TMP153]], i8 [[TMP154]], i32 77 +; OPT-NEXT: [[TMP156:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP157:%.*]] = insertelement <128 x i8> [[TMP155]], i8 [[TMP156]], i32 78 +; OPT-NEXT: [[TMP158:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP159:%.*]] = insertelement <128 x i8> [[TMP157]], i8 [[TMP158]], i32 79 +; OPT-NEXT: [[TMP160:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, 
i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP161:%.*]] = shufflevector <128 x i8> [[TMP159]], <128 x i8> [[TMP160]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 128, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP162:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP163:%.*]] = insertelement <128 x i8> [[TMP161]], i8 [[TMP162]], i32 81 +; OPT-NEXT: [[TMP164:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP165:%.*]] = insertelement <128 x i8> [[TMP163]], i8 [[TMP164]], i32 82 +; OPT-NEXT: [[TMP166:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP167:%.*]] = insertelement <128 x i8> [[TMP165]], i8 [[TMP166]], i32 83 +; OPT-NEXT: [[TMP168:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP169:%.*]] = insertelement <128 x i8> [[TMP167]], i8 [[TMP168]], i32 84 +; OPT-NEXT: [[TMP170:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP171:%.*]] = insertelement <128 x i8> [[TMP169]], i8 [[TMP170]], i32 85 +; OPT-NEXT: [[TMP172:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP173:%.*]] = insertelement <128 x i8> [[TMP171]], i8 [[TMP172]], i32 86 +; OPT-NEXT: [[TMP174:%.*]] = extractelement <16 x i8> 
[[IN]], i64 7 +; OPT-NEXT: [[TMP175:%.*]] = insertelement <128 x i8> [[TMP173]], i8 [[TMP174]], i32 87 +; OPT-NEXT: [[TMP176:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP177:%.*]] = insertelement <128 x i8> [[TMP175]], i8 [[TMP176]], i32 88 +; OPT-NEXT: [[TMP178:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP179:%.*]] = insertelement <128 x i8> [[TMP177]], i8 [[TMP178]], i32 89 +; OPT-NEXT: [[TMP180:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP181:%.*]] = insertelement <128 x i8> [[TMP179]], i8 [[TMP180]], i32 90 +; OPT-NEXT: [[TMP182:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP183:%.*]] = insertelement <128 x i8> [[TMP181]], i8 [[TMP182]], i32 91 +; OPT-NEXT: [[TMP184:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP185:%.*]] = insertelement <128 x i8> [[TMP183]], i8 [[TMP184]], i32 92 +; OPT-NEXT: [[TMP186:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP187:%.*]] = insertelement <128 x i8> [[TMP185]], i8 [[TMP186]], i32 93 +; OPT-NEXT: [[TMP188:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP189:%.*]] = insertelement <128 x i8> [[TMP187]], i8 [[TMP188]], i32 94 +; OPT-NEXT: [[TMP190:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP191:%.*]] = insertelement <128 x i8> [[TMP189]], i8 [[TMP190]], i32 95 +; OPT-NEXT: [[TMP192:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP193:%.*]] = shufflevector <128 x i8> [[TMP191]], <128 x i8> [[TMP192]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, 
i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 128, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP194:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP195:%.*]] = insertelement <128 x i8> [[TMP193]], i8 [[TMP194]], i32 97 +; OPT-NEXT: [[TMP196:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP197:%.*]] = insertelement <128 x i8> [[TMP195]], i8 [[TMP196]], i32 98 +; OPT-NEXT: [[TMP198:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP199:%.*]] = insertelement <128 x i8> [[TMP197]], i8 [[TMP198]], i32 99 +; OPT-NEXT: [[TMP200:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP201:%.*]] = insertelement <128 x i8> [[TMP199]], i8 [[TMP200]], i32 100 +; OPT-NEXT: [[TMP202:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP203:%.*]] = insertelement <128 x i8> [[TMP201]], i8 [[TMP202]], i32 101 +; OPT-NEXT: [[TMP204:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP205:%.*]] = insertelement <128 x i8> [[TMP203]], i8 [[TMP204]], i32 102 +; OPT-NEXT: [[TMP206:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP207:%.*]] = insertelement <128 x i8> [[TMP205]], i8 [[TMP206]], i32 103 +; OPT-NEXT: [[TMP208:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP209:%.*]] = insertelement <128 x i8> [[TMP207]], i8 [[TMP208]], i32 104 +; OPT-NEXT: [[TMP210:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: [[TMP211:%.*]] = insertelement <128 x i8> [[TMP209]], i8 [[TMP210]], i32 105 +; OPT-NEXT: [[TMP212:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP213:%.*]] = insertelement <128 x i8> [[TMP211]], i8 [[TMP212]], i32 106 +; OPT-NEXT: [[TMP214:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP215:%.*]] = insertelement <128 x i8> [[TMP213]], i8 [[TMP214]], i32 107 +; OPT-NEXT: [[TMP216:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP217:%.*]] = insertelement <128 x i8> [[TMP215]], i8 [[TMP216]], i32 108 +; OPT-NEXT: [[TMP218:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP219:%.*]] = insertelement <128 x i8> [[TMP217]], i8 [[TMP218]], i32 109 +; OPT-NEXT: [[TMP220:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP221:%.*]] = insertelement <128 x i8> [[TMP219]], i8 [[TMP220]], i32 110 +; OPT-NEXT: [[TMP222:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP223:%.*]] = insertelement <128 x i8> [[TMP221]], i8 [[TMP222]], i32 111 +; OPT-NEXT: [[TMP224:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <128 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 
poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP225:%.*]] = shufflevector <128 x i8> [[TMP223]], <128 x i8> [[TMP224]], <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 128, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127> +; OPT-NEXT: [[TMP226:%.*]] = extractelement <16 x i8> [[IN]], i64 1 +; OPT-NEXT: [[TMP227:%.*]] = insertelement <128 x i8> [[TMP225]], i8 [[TMP226]], i32 113 +; OPT-NEXT: [[TMP228:%.*]] = extractelement <16 x i8> [[IN]], i64 2 +; OPT-NEXT: [[TMP229:%.*]] = insertelement <128 x i8> [[TMP227]], i8 [[TMP228]], i32 114 +; OPT-NEXT: [[TMP230:%.*]] = extractelement <16 x i8> [[IN]], i64 3 +; OPT-NEXT: [[TMP231:%.*]] = insertelement <128 x i8> [[TMP229]], i8 [[TMP230]], i32 115 +; OPT-NEXT: [[TMP232:%.*]] = extractelement <16 x i8> [[IN]], i64 4 +; OPT-NEXT: [[TMP233:%.*]] = insertelement <128 x i8> [[TMP231]], i8 [[TMP232]], i32 116 +; OPT-NEXT: [[TMP234:%.*]] = extractelement <16 x i8> [[IN]], i64 5 +; OPT-NEXT: [[TMP235:%.*]] = insertelement <128 x i8> [[TMP233]], i8 [[TMP234]], i32 117 +; OPT-NEXT: [[TMP236:%.*]] = extractelement <16 x i8> [[IN]], i64 6 +; OPT-NEXT: [[TMP237:%.*]] = insertelement <128 x i8> [[TMP235]], i8 [[TMP236]], i32 118 +; OPT-NEXT: [[TMP238:%.*]] = extractelement <16 x i8> [[IN]], i64 7 +; OPT-NEXT: [[TMP239:%.*]] = insertelement <128 x i8> [[TMP237]], i8 [[TMP238]], i32 119 +; OPT-NEXT: [[TMP240:%.*]] = extractelement <16 x i8> [[IN]], i64 8 +; OPT-NEXT: [[TMP241:%.*]] = insertelement <128 x i8> [[TMP239]], i8 [[TMP240]], i32 120 +; OPT-NEXT: [[TMP242:%.*]] = extractelement <16 x i8> [[IN]], i64 9 +; OPT-NEXT: 
[[TMP243:%.*]] = insertelement <128 x i8> [[TMP241]], i8 [[TMP242]], i32 121 +; OPT-NEXT: [[TMP244:%.*]] = extractelement <16 x i8> [[IN]], i64 10 +; OPT-NEXT: [[TMP245:%.*]] = insertelement <128 x i8> [[TMP243]], i8 [[TMP244]], i32 122 +; OPT-NEXT: [[TMP246:%.*]] = extractelement <16 x i8> [[IN]], i64 11 +; OPT-NEXT: [[TMP247:%.*]] = insertelement <128 x i8> [[TMP245]], i8 [[TMP246]], i32 123 +; OPT-NEXT: [[TMP248:%.*]] = extractelement <16 x i8> [[IN]], i64 12 +; OPT-NEXT: [[TMP249:%.*]] = insertelement <128 x i8> [[TMP247]], i8 [[TMP248]], i32 124 +; OPT-NEXT: [[TMP250:%.*]] = extractelement <16 x i8> [[IN]], i64 13 +; OPT-NEXT: [[TMP251:%.*]] = insertelement <128 x i8> [[TMP249]], i8 [[TMP250]], i32 125 +; OPT-NEXT: [[TMP252:%.*]] = extractelement <16 x i8> [[IN]], i64 14 +; OPT-NEXT: [[TMP253:%.*]] = insertelement <128 x i8> [[TMP251]], i8 [[TMP252]], i32 126 +; OPT-NEXT: [[TMP254:%.*]] = extractelement <16 x i8> [[IN]], i64 15 +; OPT-NEXT: [[TMP255:%.*]] = insertelement <128 x i8> [[TMP253]], i8 [[TMP254]], i32 127 +; OPT-NEXT: [[TMP256:%.*]] = shufflevector <16 x i8> [[IN]], <16 x i8> poison, <16 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> +; OPT-NEXT: [[TMP257:%.*]] = insertelement <16 x i8> [[TMP256]], i8 [[TMP162]], i64 1 +; OPT-NEXT: [[TMP258:%.*]] = insertelement <16 x i8> [[TMP257]], i8 [[TMP164]], i64 2 +; OPT-NEXT: [[TMP259:%.*]] = insertelement <16 x i8> [[TMP258]], i8 [[TMP166]], i64 3 +; OPT-NEXT: [[TMP260:%.*]] = insertelement <16 x i8> [[TMP259]], i8 [[TMP168]], i64 4 +; OPT-NEXT: [[TMP261:%.*]] = insertelement <16 x i8> [[TMP260]], i8 [[TMP170]], i64 5 +; OPT-NEXT: [[TMP262:%.*]] = insertelement <16 x i8> [[TMP261]], i8 [[TMP172]], i64 6 +; OPT-NEXT: [[TMP263:%.*]] = insertelement <16 x i8> [[TMP262]], i8 [[TMP174]], i64 7 +; OPT-NEXT: [[TMP264:%.*]] = insertelement <16 x i8> [[TMP263]], i8 [[TMP176]], i64 8 +; OPT-NEXT: [[TMP265:%.*]] = insertelement <16 x i8> [[TMP264]], i8 [[TMP178]], i64 9 +; OPT-NEXT: [[TMP266:%.*]] = insertelement <16 x i8> [[TMP265]], i8 [[TMP180]], i64 10 +; OPT-NEXT: [[TMP267:%.*]] = insertelement <16 x i8> [[TMP266]], i8 [[TMP182]], i64 11 +; OPT-NEXT: [[TMP268:%.*]] = insertelement <16 x i8> [[TMP267]], i8 [[TMP184]], i64 12 +; OPT-NEXT: [[TMP269:%.*]] = insertelement <16 x i8> [[TMP268]], i8 [[TMP186]], i64 13 +; OPT-NEXT: [[TMP270:%.*]] = insertelement <16 x i8> [[TMP269]], i8 [[TMP188]], i64 14 +; OPT-NEXT: [[TMP271:%.*]] = shufflevector <16 x i8> [[TMP270]], <16 x i8> [[IN]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31> +; OPT-NEXT: [[SUM:%.*]] = add <16 x i8> [[TMP271]], [[ADD]] +; OPT-NEXT: store <16 x i8> [[SUM]], ptr addrspace(3) [[OUT]], align 16 +; OPT-NEXT: ret void +; +entry: + %alloca = freeze <128 x i8> poison + %0 = extractelement <16 x i8> %in, i64 0 + %1 = insertelement <128 x i8> %alloca, i8 %0, i32 0 + %2 = extractelement <16 x i8> %in, i64 1 + %3 = insertelement <128 x i8> %1, i8 %2, i32 1 + %4 = extractelement <16 x i8> %in, i64 2 + %5 = insertelement <128 x i8> %3, i8 %4, i32 2 + %6 = extractelement <16 x i8> %in, i64 3 + %7 = insertelement <128 x i8> %5, i8 %6, i32 3 + %8 = extractelement <16 x i8> %in, i64 4 + %9 = insertelement <128 x i8> %7, i8 %8, i32 4 + %10 = extractelement <16 x i8> %in, i64 5 + %11 = insertelement <128 x i8> %9, i8 %10, i32 5 + %12 = extractelement <16 x i8> %in, 
i64 6 + %13 = insertelement <128 x i8> %11, i8 %12, i32 6 + %14 = extractelement <16 x i8> %in, i64 7 + %15 = insertelement <128 x i8> %13, i8 %14, i32 7 + %16 = extractelement <16 x i8> %in, i64 8 + %17 = insertelement <128 x i8> %15, i8 %16, i32 8 + %18 = extractelement <16 x i8> %in, i64 9 + %19 = insertelement <128 x i8> %17, i8 %18, i32 9 + %20 = extractelement <16 x i8> %in, i64 10 + %21 = insertelement <128 x i8> %19, i8 %20, i32 10 + %22 = extractelement <16 x i8> %in, i64 11 + %23 = insertelement <128 x i8> %21, i8 %22, i32 11 + %24 = extractelement <16 x i8> %in, i64 12 + %25 = insertelement <128 x i8> %23, i8 %24, i32 12 + %26 = extractelement <16 x i8> %in, i64 13 + %27 = insertelement <128 x i8> %25, i8 %26, i32 13 + %28 = extractelement <16 x i8> %in, i64 14 + %29 = insertelement <128 x i8> %27, i8 %28, i32 14 + %30 = extractelement <16 x i8> %in, i64 15 + %31 = insertelement <128 x i8> %29, i8 %30, i32 15 + %32 = extractelement <16 x i8> %in, i64 0 + %33 = insertelement <128 x i8> %31, i8 %32, i32 16 + %34 = extractelement <16 x i8> %in, i64 1 + %35 = insertelement <128 x i8> %33, i8 %34, i32 17 + %36 = extractelement <16 x i8> %in, i64 2 + %37 = insertelement <128 x i8> %35, i8 %36, i32 18 + %38 = extractelement <16 x i8> %in, i64 3 + %39 = insertelement <128 x i8> %37, i8 %38, i32 19 + %40 = extractelement <16 x i8> %in, i64 4 + %41 = insertelement <128 x i8> %39, i8 %40, i32 20 + %42 = extractelement <16 x i8> %in, i64 5 + %43 = insertelement <128 x i8> %41, i8 %42, i32 21 + %44 = extractelement <16 x i8> %in, i64 6 + %45 = insertelement <128 x i8> %43, i8 %44, i32 22 + %46 = extractelement <16 x i8> %in, i64 7 + %47 = insertelement <128 x i8> %45, i8 %46, i32 23 + %48 = extractelement <16 x i8> %in, i64 8 + %49 = insertelement <128 x i8> %47, i8 %48, i32 24 + %50 = extractelement <16 x i8> %in, i64 9 + %51 = insertelement <128 x i8> %49, i8 %50, i32 25 + %52 = extractelement <16 x i8> %in, i64 10 + %53 = insertelement <128 x i8> %51, i8 %52, i32 26 + %54 = extractelement <16 x i8> %in, i64 11 + %55 = insertelement <128 x i8> %53, i8 %54, i32 27 + %56 = extractelement <16 x i8> %in, i64 12 + %57 = insertelement <128 x i8> %55, i8 %56, i32 28 + %58 = extractelement <16 x i8> %in, i64 13 + %59 = insertelement <128 x i8> %57, i8 %58, i32 29 + %60 = extractelement <16 x i8> %in, i64 14 + %61 = insertelement <128 x i8> %59, i8 %60, i32 30 + %62 = extractelement <16 x i8> %in, i64 15 + %63 = insertelement <128 x i8> %61, i8 %62, i32 31 + %64 = extractelement <16 x i8> %in, i64 0 + %65 = insertelement <128 x i8> %63, i8 %64, i32 32 + %66 = extractelement <16 x i8> %in, i64 1 + %67 = insertelement <128 x i8> %65, i8 %66, i32 33 + %68 = extractelement <16 x i8> %in, i64 2 + %69 = insertelement <128 x i8> %67, i8 %68, i32 34 + %70 = extractelement <16 x i8> %in, i64 3 + %71 = insertelement <128 x i8> %69, i8 %70, i32 35 + %72 = extractelement <16 x i8> %in, i64 4 + %73 = insertelement <128 x i8> %71, i8 %72, i32 36 + %74 = extractelement <16 x i8> %in, i64 5 + %75 = insertelement <128 x i8> %73, i8 %74, i32 37 + %76 = extractelement <16 x i8> %in, i64 6 + %77 = insertelement <128 x i8> %75, i8 %76, i32 38 + %78 = extractelement <16 x i8> %in, i64 7 + %79 = insertelement <128 x i8> %77, i8 %78, i32 39 + %80 = extractelement <16 x i8> %in, i64 8 + %81 = insertelement <128 x i8> %79, i8 %80, i32 40 + %82 = extractelement <16 x i8> %in, i64 9 + %83 = insertelement <128 x i8> %81, i8 %82, i32 41 + %84 = extractelement <16 x i8> %in, i64 10 + %85 = insertelement <128 x i8> %83, i8 %84, 
i32 42 + %86 = extractelement <16 x i8> %in, i64 11 + %87 = insertelement <128 x i8> %85, i8 %86, i32 43 + %88 = extractelement <16 x i8> %in, i64 12 + %89 = insertelement <128 x i8> %87, i8 %88, i32 44 + %90 = extractelement <16 x i8> %in, i64 13 + %91 = insertelement <128 x i8> %89, i8 %90, i32 45 + %92 = extractelement <16 x i8> %in, i64 14 + %93 = insertelement <128 x i8> %91, i8 %92, i32 46 + %94 = extractelement <16 x i8> %in, i64 15 + %95 = insertelement <128 x i8> %93, i8 %94, i32 47 + %96 = extractelement <16 x i8> %in, i64 0 + %97 = insertelement <128 x i8> %95, i8 %96, i32 48 + %98 = extractelement <16 x i8> %in, i64 1 + %99 = insertelement <128 x i8> %97, i8 %98, i32 49 + %100 = extractelement <16 x i8> %in, i64 2 + %101 = insertelement <128 x i8> %99, i8 %100, i32 50 + %102 = extractelement <16 x i8> %in, i64 3 + %103 = insertelement <128 x i8> %101, i8 %102, i32 51 + %104 = extractelement <16 x i8> %in, i64 4 + %105 = insertelement <128 x i8> %103, i8 %104, i32 52 + %106 = extractelement <16 x i8> %in, i64 5 + %107 = insertelement <128 x i8> %105, i8 %106, i32 53 + %108 = extractelement <16 x i8> %in, i64 6 + %109 = insertelement <128 x i8> %107, i8 %108, i32 54 + %110 = extractelement <16 x i8> %in, i64 7 + %111 = insertelement <128 x i8> %109, i8 %110, i32 55 + %112 = extractelement <16 x i8> %in, i64 8 + %113 = insertelement <128 x i8> %111, i8 %112, i32 56 + %114 = extractelement <16 x i8> %in, i64 9 + %115 = insertelement <128 x i8> %113, i8 %114, i32 57 + %116 = extractelement <16 x i8> %in, i64 10 + %117 = insertelement <128 x i8> %115, i8 %116, i32 58 + %118 = extractelement <16 x i8> %in, i64 11 + %119 = insertelement <128 x i8> %117, i8 %118, i32 59 + %120 = extractelement <16 x i8> %in, i64 12 + %121 = insertelement <128 x i8> %119, i8 %120, i32 60 + %122 = extractelement <16 x i8> %in, i64 13 + %123 = insertelement <128 x i8> %121, i8 %122, i32 61 + %124 = extractelement <16 x i8> %in, i64 14 + %125 = insertelement <128 x i8> %123, i8 %124, i32 62 + %126 = extractelement <16 x i8> %in, i64 15 + %127 = insertelement <128 x i8> %125, i8 %126, i32 63 + %128 = extractelement <16 x i8> %in, i64 0 + %129 = insertelement <128 x i8> %127, i8 %128, i32 64 + %130 = extractelement <16 x i8> %in, i64 1 + %131 = insertelement <128 x i8> %129, i8 %130, i32 65 + %132 = extractelement <16 x i8> %in, i64 2 + %133 = insertelement <128 x i8> %131, i8 %132, i32 66 + %134 = extractelement <16 x i8> %in, i64 3 + %135 = insertelement <128 x i8> %133, i8 %134, i32 67 + %136 = extractelement <16 x i8> %in, i64 4 + %137 = insertelement <128 x i8> %135, i8 %136, i32 68 + %138 = extractelement <16 x i8> %in, i64 5 + %139 = insertelement <128 x i8> %137, i8 %138, i32 69 + %140 = extractelement <16 x i8> %in, i64 6 + %141 = insertelement <128 x i8> %139, i8 %140, i32 70 + %142 = extractelement <16 x i8> %in, i64 7 + %143 = insertelement <128 x i8> %141, i8 %142, i32 71 + %144 = extractelement <16 x i8> %in, i64 8 + %145 = insertelement <128 x i8> %143, i8 %144, i32 72 + %146 = extractelement <16 x i8> %in, i64 9 + %147 = insertelement <128 x i8> %145, i8 %146, i32 73 + %148 = extractelement <16 x i8> %in, i64 10 + %149 = insertelement <128 x i8> %147, i8 %148, i32 74 + %150 = extractelement <16 x i8> %in, i64 11 + %151 = insertelement <128 x i8> %149, i8 %150, i32 75 + %152 = extractelement <16 x i8> %in, i64 12 + %153 = insertelement <128 x i8> %151, i8 %152, i32 76 + %154 = extractelement <16 x i8> %in, i64 13 + %155 = insertelement <128 x i8> %153, i8 %154, i32 77 + %156 = extractelement 
<16 x i8> %in, i64 14 + %157 = insertelement <128 x i8> %155, i8 %156, i32 78 + %158 = extractelement <16 x i8> %in, i64 15 + %159 = insertelement <128 x i8> %157, i8 %158, i32 79 + %160 = extractelement <16 x i8> %in, i64 0 + %161 = insertelement <128 x i8> %159, i8 %160, i32 80 + %162 = extractelement <16 x i8> %in, i64 1 + %163 = insertelement <128 x i8> %161, i8 %162, i32 81 + %164 = extractelement <16 x i8> %in, i64 2 + %165 = insertelement <128 x i8> %163, i8 %164, i32 82 + %166 = extractelement <16 x i8> %in, i64 3 + %167 = insertelement <128 x i8> %165, i8 %166, i32 83 + %168 = extractelement <16 x i8> %in, i64 4 + %169 = insertelement <128 x i8> %167, i8 %168, i32 84 + %170 = extractelement <16 x i8> %in, i64 5 + %171 = insertelement <128 x i8> %169, i8 %170, i32 85 + %172 = extractelement <16 x i8> %in, i64 6 + %173 = insertelement <128 x i8> %171, i8 %172, i32 86 + %174 = extractelement <16 x i8> %in, i64 7 + %175 = insertelement <128 x i8> %173, i8 %174, i32 87 + %176 = extractelement <16 x i8> %in, i64 8 + %177 = insertelement <128 x i8> %175, i8 %176, i32 88 + %178 = extractelement <16 x i8> %in, i64 9 + %179 = insertelement <128 x i8> %177, i8 %178, i32 89 + %180 = extractelement <16 x i8> %in, i64 10 + %181 = insertelement <128 x i8> %179, i8 %180, i32 90 + %182 = extractelement <16 x i8> %in, i64 11 + %183 = insertelement <128 x i8> %181, i8 %182, i32 91 + %184 = extractelement <16 x i8> %in, i64 12 + %185 = insertelement <128 x i8> %183, i8 %184, i32 92 + %186 = extractelement <16 x i8> %in, i64 13 + %187 = insertelement <128 x i8> %185, i8 %186, i32 93 + %188 = extractelement <16 x i8> %in, i64 14 + %189 = insertelement <128 x i8> %187, i8 %188, i32 94 + %190 = extractelement <16 x i8> %in, i64 15 + %191 = insertelement <128 x i8> %189, i8 %190, i32 95 + %192 = extractelement <16 x i8> %in, i64 0 + %193 = insertelement <128 x i8> %191, i8 %192, i32 96 + %194 = extractelement <16 x i8> %in, i64 1 + %195 = insertelement <128 x i8> %193, i8 %194, i32 97 + %196 = extractelement <16 x i8> %in, i64 2 + %197 = insertelement <128 x i8> %195, i8 %196, i32 98 + %198 = extractelement <16 x i8> %in, i64 3 + %199 = insertelement <128 x i8> %197, i8 %198, i32 99 + %200 = extractelement <16 x i8> %in, i64 4 + %201 = insertelement <128 x i8> %199, i8 %200, i32 100 + %202 = extractelement <16 x i8> %in, i64 5 + %203 = insertelement <128 x i8> %201, i8 %202, i32 101 + %204 = extractelement <16 x i8> %in, i64 6 + %205 = insertelement <128 x i8> %203, i8 %204, i32 102 + %206 = extractelement <16 x i8> %in, i64 7 + %207 = insertelement <128 x i8> %205, i8 %206, i32 103 + %208 = extractelement <16 x i8> %in, i64 8 + %209 = insertelement <128 x i8> %207, i8 %208, i32 104 + %210 = extractelement <16 x i8> %in, i64 9 + %211 = insertelement <128 x i8> %209, i8 %210, i32 105 + %212 = extractelement <16 x i8> %in, i64 10 + %213 = insertelement <128 x i8> %211, i8 %212, i32 106 + %214 = extractelement <16 x i8> %in, i64 11 + %215 = insertelement <128 x i8> %213, i8 %214, i32 107 + %216 = extractelement <16 x i8> %in, i64 12 + %217 = insertelement <128 x i8> %215, i8 %216, i32 108 + %218 = extractelement <16 x i8> %in, i64 13 + %219 = insertelement <128 x i8> %217, i8 %218, i32 109 + %220 = extractelement <16 x i8> %in, i64 14 + %221 = insertelement <128 x i8> %219, i8 %220, i32 110 + %222 = extractelement <16 x i8> %in, i64 15 + %223 = insertelement <128 x i8> %221, i8 %222, i32 111 + %224 = extractelement <16 x i8> %in, i64 0 + %225 = insertelement <128 x i8> %223, i8 %224, i32 112 + %226 = 
extractelement <16 x i8> %in, i64 1 + %227 = insertelement <128 x i8> %225, i8 %226, i32 113 + %228 = extractelement <16 x i8> %in, i64 2 + %229 = insertelement <128 x i8> %227, i8 %228, i32 114 + %230 = extractelement <16 x i8> %in, i64 3 + %231 = insertelement <128 x i8> %229, i8 %230, i32 115 + %232 = extractelement <16 x i8> %in, i64 4 + %233 = insertelement <128 x i8> %231, i8 %232, i32 116 + %234 = extractelement <16 x i8> %in, i64 5 + %235 = insertelement <128 x i8> %233, i8 %234, i32 117 + %236 = extractelement <16 x i8> %in, i64 6 + %237 = insertelement <128 x i8> %235, i8 %236, i32 118 + %238 = extractelement <16 x i8> %in, i64 7 + %239 = insertelement <128 x i8> %237, i8 %238, i32 119 + %240 = extractelement <16 x i8> %in, i64 8 + %241 = insertelement <128 x i8> %239, i8 %240, i32 120 + %242 = extractelement <16 x i8> %in, i64 9 + %243 = insertelement <128 x i8> %241, i8 %242, i32 121 + %244 = extractelement <16 x i8> %in, i64 10 + %245 = insertelement <128 x i8> %243, i8 %244, i32 122 + %246 = extractelement <16 x i8> %in, i64 11 + %247 = insertelement <128 x i8> %245, i8 %246, i32 123 + %248 = extractelement <16 x i8> %in, i64 12 + %249 = insertelement <128 x i8> %247, i8 %248, i32 124 + %250 = extractelement <16 x i8> %in, i64 13 + %251 = insertelement <128 x i8> %249, i8 %250, i32 125 + %252 = extractelement <16 x i8> %in, i64 14 + %253 = insertelement <128 x i8> %251, i8 %252, i32 126 + %254 = extractelement <16 x i8> %in, i64 15 + %255 = insertelement <128 x i8> %253, i8 %254, i32 127 + %256 = insertelement <16 x i8> poison, i8 %160, i64 0 + %257 = insertelement <16 x i8> %256, i8 %162, i64 1 + %258 = insertelement <16 x i8> %257, i8 %164, i64 2 + %259 = insertelement <16 x i8> %258, i8 %166, i64 3 + %260 = insertelement <16 x i8> %259, i8 %168, i64 4 + %261 = insertelement <16 x i8> %260, i8 %170, i64 5 + %262 = insertelement <16 x i8> %261, i8 %172, i64 6 + %263 = insertelement <16 x i8> %262, i8 %174, i64 7 + %264 = insertelement <16 x i8> %263, i8 %176, i64 8 + %265 = insertelement <16 x i8> %264, i8 %178, i64 9 + %266 = insertelement <16 x i8> %265, i8 %180, i64 10 + %267 = insertelement <16 x i8> %266, i8 %182, i64 11 + %268 = insertelement <16 x i8> %267, i8 %184, i64 12 + %269 = insertelement <16 x i8> %268, i8 %186, i64 13 + %270 = insertelement <16 x i8> %269, i8 %188, i64 14 + %271 = insertelement <16 x i8> %270, i8 %190, i64 15 + %sum = add <16 x i8> %271, %add + store <16 x i8> %sum, ptr addrspace(3) %out, align 16 + ret void +} + +attributes #0 = { "amdgpu-waves-per-eu"="2,2" } diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 11a5a5785a6ec..89f1ca6935cff 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -57,8 +57,13 @@ # so we just exclude llvm-reduce tests from this config altogether. This should # be fine though as profcheck config tests are mostly concerned with opt. config.excludes.append("llvm-reduce") + # Exclude llvm-objcopy tests - not the target of this effort, and some use + # cat in ways that conflict with how profcheck uses it. + config.excludes.append("llvm-objcopy") # (Issue #161235) Temporarily exclude LoopVectorize. config.excludes.append("LoopVectorize") + # exclude UpdateTestChecks - they fail because of inserted prof annotations + config.excludes.append("UpdateTestChecks") # test_source_root: The root path where tests are located. 
config.test_source_root = os.path.dirname(__file__) @@ -474,7 +479,7 @@ def enable_ptxas(ptxas_executable): config.available_features.add("host-byteorder-" + sys.byteorder + "-endian") if config.target_triple: if re.match( - r"(aarch64_be|arc|armeb|bpfeb|lanai|m68k|mips|mips64|powerpc|powerpc64|sparc|sparcv9|s390x|s390|tce|thumbeb)-.*", + r"(aarch64_be|arc|armeb|bpfeb|lanai|m68k|mips|mips64|powerpc|powerpc64|sparc|sparcv9|sparc64|s390x|s390|tce|thumbeb)-.*", config.target_triple, ): config.available_features.add("target-byteorder-big-endian") @@ -579,7 +584,7 @@ def have_cxx_shared_library(): print("could not exec llvm-readobj") return False - readobj_out = readobj_cmd.stdout.read().decode("ascii") + readobj_out = readobj_cmd.stdout.read().decode("utf-8") readobj_cmd.wait() regex = re.compile(r"(libc\+\+|libstdc\+\+|msvcp).*\.(so|dylib|dll)") diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll new file mode 100644 index 0000000000000..292637177591f --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll @@ -0,0 +1,17 @@ +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR + +define i64 @test1(i64 %i) nounwind readnone { + %loc = alloca i64 + %j = load i64, ptr %loc + %r = add i64 %i, %j + ret i64 %r +} + +define i64 @test2(i32 %i) nounwind readnone { + %loc = alloca i32 + %j = load i32, ptr %loc + %r = add i32 %i, %j + %ext = zext i32 %r to i64 + ret i64 %ext +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected new file mode 100644 index 0000000000000..88cb03e85204a --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_mixed.ll.expected @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=ASM +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=MIR + +define i64 @test1(i64 %i) nounwind readnone { +; ASM-LABEL: test1: +; ASM: # %bb.0: +; ASM-NEXT: movq %rdi, %rax +; ASM-NEXT: addq -{{[0-9]+}}(%rsp), %rax +; ASM-NEXT: retq +; MIR-LABEL: name: test1 +; MIR: bb.0 (%ir-block.0): +; MIR-NEXT: liveins: $rdi +; MIR-NEXT: {{ $}} +; MIR-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rdi +; MIR-NEXT: [[ADD64rm:%[0-9]+]]:gr64 = ADD64rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s64) from %ir.loc) +; MIR-NEXT: $rax = COPY [[ADD64rm]] +; MIR-NEXT: RET 0, $rax + %loc = alloca i64 + %j = load i64, ptr %loc + %r = add i64 %i, %j + ret i64 %r +} + +define i64 @test2(i32 %i) nounwind readnone { +; ASM-LABEL: test2: +; ASM: # %bb.0: +; ASM-NEXT: movl %edi, %eax +; ASM-NEXT: addl -{{[0-9]+}}(%rsp), %eax +; ASM-NEXT: retq +; MIR-LABEL: name: test2 +; MIR: bb.0 (%ir-block.0): +; MIR-NEXT: liveins: $edi +; MIR-NEXT: {{ $}} +; MIR-NEXT: [[COPY:%[0-9]+]]:gr32 = COPY $edi +; MIR-NEXT: [[ADD32rm:%[0-9]+]]:gr32 = ADD32rm [[COPY]], %stack.0.loc, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (dereferenceable load (s32) from %ir.loc) +; MIR-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:gr64 = SUBREG_TO_REG 0, killed [[ADD32rm]], %subreg.sub_32bit +; MIR-NEXT: $rax = COPY 
[[SUBREG_TO_REG]] +; MIR-NEXT: RET 0, $rax + %loc = alloca i32 + %j = load i32, ptr %loc + %r = add i32 %i, %j + %ext = zext i32 %r to i64 + ret i64 %ext +} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll new file mode 100644 index 0000000000000..7167bcf258e68 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK + +define i32 @add(i32 %a, i32 %b) { + %sum = add i32 %a, %b + ret i32 %sum +} + +define i32 @sub(i32 %a, i32 %b) { + %diff = sub i32 %a, %b + ret i32 %diff +} + diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected new file mode 100644 index 0000000000000..1ba920d1de8b0 --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/x86_asm_mir_same_prefix.ll.expected @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64 < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple=x86_64 -stop-after=finalize-isel < %s | FileCheck %s --check-prefix=CHECK + +define i32 @add(i32 %a, i32 %b) { + %sum = add i32 %a, %b + ret i32 %sum +} + +define i32 @sub(i32 %a, i32 %b) { + %diff = sub i32 %a, %b + ret i32 %diff +} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test new file mode 100644 index 0000000000000..6fc57b583b37d --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-mixed.test @@ -0,0 +1,9 @@ +# REQUIRES: x86-registered-target +## Test checking that update_llc_test_checks.py can generate both ASM and MIR checks in the same file + +# RUN: cp -f %S/Inputs/x86_asm_mir_mixed.ll %t.ll && %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll + +## Verify that running the script again on an already updated file doesn't add duplicate checks +# RUN: %update_llc_test_checks %t.ll +# RUN: diff -u %S/Inputs/x86_asm_mir_mixed.ll.expected %t.ll diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test new file mode 100644 index 0000000000000..bb91a44678f1a --- /dev/null +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/x86-asm-mir-same-prefix.test @@ -0,0 +1,7 @@ +## Test that using the same prefix for both ASM and MIR outputs generates a warning +## and doesn't produce any checks. 
+ +# RUN: cp -f %S/Inputs/x86_asm_mir_same_prefix.ll %t.ll && %update_llc_test_checks %t.ll 2>&1 | FileCheck %s --check-prefix=WARNING +# RUN: diff -u %S/Inputs/x86_asm_mir_same_prefix.ll.expected %t.ll + +# WARNING: WARNING: The following prefixes are used for both ASM and MIR output, which will cause FileCheck failures: CHECK diff --git a/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test b/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test new file mode 100644 index 0000000000000..00141f12587d4 --- /dev/null +++ b/llvm/test/tools/dsymutil/ARM/swiftmodule-include-from-interface.test @@ -0,0 +1,33 @@ +# RUN: dsymutil -include-swiftmodules-from-interface -verbose -oso-prepend-path=%p -y -o %t.dSYM %s | FileCheck %s +# +# RUN: dsymutil -include-swiftmodules-from-interface --linker parallel -verbose -oso-prepend-path=%p -y %s -o %t-parallel.dSYM | FileCheck %s +# +# To regenerate: +# echo ''>I.swift +# echo ''>B.swift +# echo 'import I'>main.swift +# xcrun swiftc -emit-module-interface-path I.swiftinterface -enable-library-evolution I.swift +# xcrun swiftc -emit-module-path B.swiftmodule B.swift -Xfrontend -no-serialize-debugging-options +# xcrun swiftc -explicit-module-build main.swift -I. -module-cache-path cache -g -Xfrontend -no-serialize-debugging-options +# output is "B.swiftmodule" and "cache/I*.swiftmodule" +# +# CHECK-NOT: Skipping compiled textual Swift interface: {{.*}}/Inputs/Binary.swiftmodule +# CHECK-NOT: Skipping compiled textual Swift interface: {{.*}}/Inputs/FromInterface.swiftmodule + +# +--- +triple: 'arm64-apple-darwin' +objects: + - filename: '../Inputs/Binary.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] + - filename: '../Inputs/FromInterface.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] + - filename: '../Inputs/FromInterface.swiftmodule' + timestamp: 0 + type: 50 + symbols: [] +... diff --git a/llvm/test/tools/dsymutil/cmdline.test b/llvm/test/tools/dsymutil/cmdline.test index 1574fe35f5254..0b0bce194d575 100644 --- a/llvm/test/tools/dsymutil/cmdline.test +++ b/llvm/test/tools/dsymutil/cmdline.test @@ -14,6 +14,7 @@ CHECK: -fat64 CHECK: -flat CHECK: -gen-reproducer CHECK: -help +CHECK: -include-swiftmodules-from-interface CHECK: -keep-function-for-static CHECK: -no-object-timestamp CHECK: -no-odr diff --git a/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml b/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml new file mode 100644 index 0000000000000..2a8c37da80e64 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/filter-child-tag.yaml @@ -0,0 +1,136 @@ +## Tests the --filter-child-tag (-t) option. 
+ +# RUN: yaml2obj %s -o %t.o + +# RUN: llvm-dwarfdump %t.o --filter-child-tag=DW_TAG_structure_type | FileCheck %s --check-prefix=ONLY_STRUCT + +# ONLY_STRUCT: DW_TAG_compile_unit +# ONLY_STRUCT-NOT: DW_TAG_namespace +# ONLY_STRUCT-NOT: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -t DW_TAG_structure_type -t DW_TAG_namespace | \ +# RUN: FileCheck %s --check-prefix=STRUCT_AND_NS --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_member + +# STRUCT_AND_NS: DW_TAG_compile_unit +# STRUCT_AND_NS: DW_TAG_namespace +# STRUCT_AND_NS: DW_TAG_structure_type +# STRUCT_AND_NS: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=FOO_MEM --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace + +# FOO_MEM: DW_TAG_structure_type +# FOO_MEM: DW_TAG_member +# FOO_MEM: DW_TAG_member +# FOO_MEM: DW_TAG_member +# FOO_MEM-NOT: DW_TAG_structure_type +# FOO_MEM-NOT: DW_TAG_member + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t not_a_tag -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=SINGLE_INVALID_TAG --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace + +# SINGLE_INVALID_TAG: DW_TAG_structure_type +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG: DW_TAG_member +# SINGLE_INVALID_TAG-NOT: DW_TAG_structure_type +# SINGLE_INVALID_TAG-NOT: DW_TAG_member + +# RUN: llvm-dwarfdump %t.o -c --name=Foo -t not_a_tag | \ +# RUN: FileCheck %s --check-prefix=ONLY_INVALID_TAGS --implicit-check-not=DW_TAG_compile_unit --implicit-check-not=DW_TAG_subprogram --implicit-check-not=DW_TAG_namespace --implicit-check-not=DW_TAG_member + +# ONLY_INVALID_TAGS: DW_TAG_structure_type +# ONLY_INVALID_TAGS-NOT: DW_TAG_structure_type + +# RUN: llvm-dwarfdump %t.o -c -p --name=Foo -t DW_TAG_member | \ +# RUN: FileCheck %s --check-prefix=FOO_MEM_WITH_PARENT --implicit-check-not=DW_TAG_subprogram + +# FOO_MEM_WITH_PARENT: DW_TAG_compile_unit +# FOO_MEM_WITH_PARENT: DW_TAG_namespace +# FOO_MEM_WITH_PARENT: DW_TAG_structure_type +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT: DW_TAG_member +# FOO_MEM_WITH_PARENT-NOT: DW_TAG_structure_type +# FOO_MEM_WITH_PARENT-NOT: DW_TAG_member + +## Not specifying --show-children ignores the --filter-child-tag option. 
+# RUN: llvm-dwarfdump %t.o --name=Foo -t DW_TAG_member 2>&1 | FileCheck %s --check-prefix=NO_SHOW_CHILDREN + +# NO_SHOW_CHILDREN: DW_TAG_structure_type + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +DWARF: + debug_abbrev: + - Table: + - Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_producer + Form: DW_FORM_string + - Tag: DW_TAG_namespace + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_structure_type + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_member + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + - Tag: DW_TAG_subprogram + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_string + debug_info: + - Version: 5 + UnitType: DW_UT_compile + Entries: + - AbbrCode: 1 + Values: + - CStr: handwritten + - AbbrCode: 2 + Values: + - CStr: ns + - AbbrCode: 3 + Values: + - CStr: Foo + - AbbrCode: 4 + Values: + - CStr: mem1 + - AbbrCode: 4 + Values: + - CStr: mem2 + - AbbrCode: 4 + Values: + - CStr: mem3 + - AbbrCode: 3 + Values: + - CStr: NestedInFoo + - AbbrCode: 4 + Values: + - CStr: NestedMem1 + - AbbrCode: 4 + Values: + - CStr: NestedMem2 + - AbbrCode: 5 + Values: + - CStr: NestedFunc + - AbbrCode: 0x0 + - AbbrCode: 5 + Values: + - CStr: FooFunc + - AbbrCode: 0x0 + - AbbrCode: 0x0 + - AbbrCode: 0x0 diff --git a/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll b/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll index a6ace2246fa8e..b800f9aa97c8f 100644 --- a/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll +++ b/llvm/test/tools/llvm-reduce/remove-attributes-from-intrinsics.ll @@ -26,7 +26,7 @@ define i32 @t(i32 %a) { ; CHECK-ALL: declare i32 @llvm.uadd.sat.i32(i32, i32) #0 declare i32 @llvm.uadd.sat.i32(i32, i32) #0 -; CHECK-ALL: attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK-ALL: attributes #0 = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) } ; CHECK-INTERESTINGNESS: attributes #1 = { ; CHECK-INTERESTINGNESS-SAME: "arg4" diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp index b91c27e6a0f86..ee1e9060657b0 100644 --- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp +++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp @@ -794,9 +794,10 @@ bool DwarfLinkerForBinary::linkImpl( reportWarning("Could not parse binary Swift module: " + toString(FromInterfaceOrErr.takeError()), Obj->getObjectFilename()); - // Only skip swiftmodules that could be parsed and are - // positively identified as textual. - } else if (*FromInterfaceOrErr) { + // Only skip swiftmodules that could be parsed and are positively + // identified as textual. Do so only when the option allows. + } else if (*FromInterfaceOrErr && + !Options.IncludeSwiftModulesFromInterface) { if (Options.Verbose) outs() << "Skipping compiled textual Swift interface: " << Obj->getObjectFilename() << "\n"; diff --git a/llvm/tools/dsymutil/LinkUtils.h b/llvm/tools/dsymutil/LinkUtils.h index ad5515a04333e..c333a3d4afee0 100644 --- a/llvm/tools/dsymutil/LinkUtils.h +++ b/llvm/tools/dsymutil/LinkUtils.h @@ -114,6 +114,13 @@ struct LinkOptions { /// Whether all remarks should be kept or only remarks with valid debug /// locations. 
bool RemarksKeepAll = true; + + /// Whether or not to copy binary swiftmodules built from textual + /// .swiftinterface files into the dSYM bundle. These typically come only + /// from the SDK (since textual interfaces require library evolution) and + /// thus are a waste of space to copy into the bundle. Turn this on if the + /// swiftmodules are different from those in the SDK. + bool IncludeSwiftModulesFromInterface = false; /// @} LinkOptions() = default; diff --git a/llvm/tools/dsymutil/Options.td b/llvm/tools/dsymutil/Options.td index ad35e55e33b12..e99bc12fa7fd8 100644 --- a/llvm/tools/dsymutil/Options.td +++ b/llvm/tools/dsymutil/Options.td @@ -202,6 +202,14 @@ def remarks_drop_without_debug: Flag<["--", "-"], "remarks-drop-without-debug">, "all remarks are kept.">, Group<grp_general>; +def include_swiftmodules_from_interface: Flag<["--", "-"], "include-swiftmodules-from-interface">, + HelpText<"Whether or not to copy binary swiftmodules built from textual " + ".swiftinterface files into the dSYM bundle. These typically come only " + "from the SDK (since textual interfaces require library evolution) and " + "thus are a waste of space to copy into the bundle. Turn this on if the " + "swiftmodules are different from those in the SDK.">, + Group<grp_general>; + def linker: Separate<["--", "-"], "linker">, MetaVarName<"<DWARF linker type>">, HelpText<"Specify the desired type of DWARF linker. Defaults to 'classic'">, diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp index 913077eb0b06d..688f6aaf3d0c9 100644 --- a/llvm/tools/dsymutil/dsymutil.cpp +++ b/llvm/tools/dsymutil/dsymutil.cpp @@ -391,6 +391,9 @@ static Expected<DsymutilOptions> getOptions(opt::InputArgList &Args) { Options.LinkOpts.RemarksKeepAll = !Args.hasArg(OPT_remarks_drop_without_debug); + Options.LinkOpts.IncludeSwiftModulesFromInterface = + Args.hasArg(OPT_include_swiftmodules_from_interface); + if (opt::Arg *BuildVariantSuffix = Args.getLastArg(OPT_build_variant_suffix)) Options.LinkOpts.BuildVariantSuffix = BuildVariantSuffix->getValue(); diff --git a/llvm/tools/llvm-dwarfdump/CMakeLists.txt b/llvm/tools/llvm-dwarfdump/CMakeLists.txt index aeb1b8f14d830..7a0adf32e938c 100644 --- a/llvm/tools/llvm-dwarfdump/CMakeLists.txt +++ b/llvm/tools/llvm-dwarfdump/CMakeLists.txt @@ -1,4 +1,5 @@ set(LLVM_LINK_COMPONENTS + BinaryFormat DebugInfoDWARF DebugInfoDWARFLowLevel AllTargetsDescs diff --git a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp index 11eb58ea911df..6f120f93700f6 100644 --- a/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp +++ b/llvm/tools/llvm-dwarfdump/llvm-dwarfdump.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVectorExtras.h" #include "llvm/ADT/StringSet.h" #include "llvm/DebugInfo/DIContext.h" #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h" @@ -242,6 +243,15 @@ static opt<bool> cat(DwarfDumpCategory)); static alias ShowParentsAlias("p", desc("Alias for --show-parents."), aliasopt(ShowParents), cl::NotHidden); + +static list<std::string> FilterChildTag( + "filter-child-tag", + desc("When --show-children is specified, show only DIEs with the " + "specified DWARF tags."), + value_desc("list of DWARF tags"), cat(DwarfDumpCategory)); +static alias FilterChildTagAlias("t", desc("Alias for --filter-child-tag."), + aliasopt(FilterChildTag), cl::NotHidden); + static opt<bool> ShowForm("show-form", desc("Show DWARF form types after the DWARF 
attribute types."), @@ -330,6 +340,13 @@ static cl::extrahelp /// @} //===----------------------------------------------------------------------===// +static llvm::SmallVector<unsigned> +makeTagVector(const list<std::string> &TagStrings) { + return llvm::map_to_vector(TagStrings, [](const std::string &Tag) { + return llvm::dwarf::getTag(Tag); + }); +} + static void error(Error Err) { if (!Err) return; @@ -356,6 +373,7 @@ static DIDumpOptions getDumpOpts(DWARFContext &C) { DumpOpts.ShowAddresses = !Diff; DumpOpts.ShowChildren = ShowChildren; DumpOpts.ShowParents = ShowParents; + DumpOpts.FilterChildTag = makeTagVector(FilterChildTag); DumpOpts.ShowForm = ShowForm; DumpOpts.SummarizeTypes = SummarizeTypes; DumpOpts.Verbose = Verbose; diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 79216e89c7cba..88d6daf08d35e 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -776,6 +776,7 @@ createSharedMemoryManager(SimpleRemoteEPC &SREPC) { SlabSize, SREPC, SAs); } +#if LLVM_ON_UNIX && LLVM_ENABLE_THREADS static void setupEPCRemoteMemoryManager(SimpleRemoteEPC::Setup &S) { switch (UseMemMgr) { case MemMgr::Default: @@ -789,6 +790,7 @@ static void setupEPCRemoteMemoryManager(SimpleRemoteEPC::Setup &S) { break; } } +#endif static Expected<MaterializationUnit::Interface> getTestObjectFileInterface(Session &S, MemoryBufferRef O) { diff --git a/llvm/tools/opt-viewer/optrecord.py b/llvm/tools/opt-viewer/optrecord.py index b9244fd1ae739..07e21028535c4 100644 --- a/llvm/tools/opt-viewer/optrecord.py +++ b/llvm/tools/opt-viewer/optrecord.py @@ -344,6 +344,8 @@ def find_opt_files(*dirs_or_files): d for d in subdirs if not os.path.ismount(os.path.join(dir, d)) ] for file in files: - if fnmatch.fnmatch(file, "*.opt.yaml*"): + if fnmatch.fnmatch(file, "*.opt.yaml*") or fnmatch.fnmatch( + file, "*.opt.ld.yaml*" + ): all.append(os.path.join(dir, file)) return all diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp index fbe96bb127836..99cc38b6b422b 100644 --- a/llvm/unittests/ADT/APFloatTest.cpp +++ b/llvm/unittests/ADT/APFloatTest.cpp @@ -10118,7 +10118,7 @@ TEST(APFloatTest, Float4E2M1FNToFloat) { } TEST(APFloatTest, AddOrSubtractSignificand) { - typedef detail::IEEEFloatUnitTestHelper Helper; + using Helper = detail::IEEEFloatUnitTestHelper; // Test cases are all combinations of: // {equal exponents, LHS larger exponent, RHS larger exponent} // {equal significands, LHS larger significand, RHS larger significand} diff --git a/llvm/unittests/ADT/BitVectorTest.cpp b/llvm/unittests/ADT/BitVectorTest.cpp index 12ba0041af551..e13523b8e10c3 100644 --- a/llvm/unittests/ADT/BitVectorTest.cpp +++ b/llvm/unittests/ADT/BitVectorTest.cpp @@ -21,7 +21,7 @@ template <typename T> class BitVectorTest : public ::testing::Test { }; // Test both BitVector and SmallBitVector with the same suite of tests. 
-typedef ::testing::Types<BitVector, SmallBitVector> BitVectorTestTypes; +using BitVectorTestTypes = ::testing::Types<BitVector, SmallBitVector>; TYPED_TEST_SUITE(BitVectorTest, BitVectorTestTypes, ); TYPED_TEST(BitVectorTest, TrivialOperation) { @@ -857,7 +857,7 @@ TYPED_TEST(BitVectorTest, BinOps) { EXPECT_FALSE(B.anyCommon(A)); } -typedef std::vector<std::pair<int, int>> RangeList; +using RangeList = std::vector<std::pair<int, int>>; template <typename VecType> static inline VecType createBitVector(uint32_t Size, diff --git a/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp b/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp index a737390e79d8d..571e4d27c6752 100644 --- a/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp +++ b/llvm/unittests/ADT/BreadthFirstIteratorTest.cpp @@ -21,7 +21,7 @@ using namespace llvm; namespace llvm { TEST(BreadthFristIteratorTest, Basic) { - typedef bf_iterator<Graph<4>> BFIter; + using BFIter = bf_iterator<Graph<4>>; Graph<4> G; G.AddEdge(0, 1); @@ -46,7 +46,7 @@ TEST(BreadthFristIteratorTest, Basic) { } TEST(BreadthFristIteratorTest, Cycle) { - typedef bf_iterator<Graph<4>> BFIter; + using BFIter = bf_iterator<Graph<4>>; Graph<4> G; G.AddEdge(0, 1); diff --git a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp index f543947899393..918a2e63da935 100644 --- a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp +++ b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp @@ -15,7 +15,7 @@ using namespace llvm; namespace { -typedef DAGDeltaAlgorithm::edge_ty edge_ty; +using edge_ty = DAGDeltaAlgorithm::edge_ty; class FixedDAGDeltaAlgorithm : public DAGDeltaAlgorithm { changeset_ty FailingSet; diff --git a/llvm/unittests/ADT/DenseMapTest.cpp b/llvm/unittests/ADT/DenseMapTest.cpp index aceb4f30d878d..273ee09fc1e28 100644 --- a/llvm/unittests/ADT/DenseMapTest.cpp +++ b/llvm/unittests/ADT/DenseMapTest.cpp @@ -129,18 +129,17 @@ typename T::mapped_type *const DenseMapTest<T>::dummy_value_ptr = nullptr; // Register these types for testing. // clang-format off -typedef ::testing::Types<DenseMap<uint32_t, uint32_t>, - DenseMap<uint32_t *, uint32_t *>, - DenseMap<CtorTester, CtorTester, CtorTesterMapInfo>, - DenseMap<EnumClass, uint32_t>, - DenseMap<std::optional<uint32_t>, uint32_t>, - SmallDenseMap<uint32_t, uint32_t>, - SmallDenseMap<uint32_t *, uint32_t *>, - SmallDenseMap<CtorTester, CtorTester, 4, - CtorTesterMapInfo>, - SmallDenseMap<EnumClass, uint32_t>, - SmallDenseMap<std::optional<uint32_t>, uint32_t> - > DenseMapTestTypes; +using DenseMapTestTypes = ::testing::Types< + DenseMap<uint32_t, uint32_t>, + DenseMap<uint32_t *, uint32_t *>, + DenseMap<CtorTester, CtorTester, CtorTesterMapInfo>, + DenseMap<EnumClass, uint32_t>, + DenseMap<std::optional<uint32_t>, uint32_t>, + SmallDenseMap<uint32_t, uint32_t>, + SmallDenseMap<uint32_t *, uint32_t *>, + SmallDenseMap<CtorTester, CtorTester, 4, CtorTesterMapInfo>, + SmallDenseMap<EnumClass, uint32_t>, + SmallDenseMap<std::optional<uint32_t>, uint32_t>>; // clang-format on TYPED_TEST_SUITE(DenseMapTest, DenseMapTestTypes, ); diff --git a/llvm/unittests/ADT/DenseSetTest.cpp b/llvm/unittests/ADT/DenseSetTest.cpp index a24f99b6bb34f..a2a062b151b67 100644 --- a/llvm/unittests/ADT/DenseSetTest.cpp +++ b/llvm/unittests/ADT/DenseSetTest.cpp @@ -96,13 +96,13 @@ template <typename T> class DenseSetTest : public testing::Test { }; // Register these types for testing. 
-typedef ::testing::Types<DenseSet<unsigned, TestDenseSetInfo>, - const DenseSet<unsigned, TestDenseSetInfo>, - SmallDenseSet<unsigned, 1, TestDenseSetInfo>, - SmallDenseSet<unsigned, 4, TestDenseSetInfo>, - const SmallDenseSet<unsigned, 4, TestDenseSetInfo>, - SmallDenseSet<unsigned, 64, TestDenseSetInfo>> - DenseSetTestTypes; +using DenseSetTestTypes = + ::testing::Types<DenseSet<unsigned, TestDenseSetInfo>, + const DenseSet<unsigned, TestDenseSetInfo>, + SmallDenseSet<unsigned, 1, TestDenseSetInfo>, + SmallDenseSet<unsigned, 4, TestDenseSetInfo>, + const SmallDenseSet<unsigned, 4, TestDenseSetInfo>, + SmallDenseSet<unsigned, 64, TestDenseSetInfo>>; TYPED_TEST_SUITE(DenseSetTest, DenseSetTestTypes, ); TYPED_TEST(DenseSetTest, Constructor) { diff --git a/llvm/unittests/ADT/DepthFirstIteratorTest.cpp b/llvm/unittests/ADT/DepthFirstIteratorTest.cpp index f792878004e7a..00312ca6044e6 100644 --- a/llvm/unittests/ADT/DepthFirstIteratorTest.cpp +++ b/llvm/unittests/ADT/DepthFirstIteratorTest.cpp @@ -21,7 +21,7 @@ using namespace llvm; namespace llvm { template <typename T> struct CountedSet { - typedef typename SmallPtrSet<T, 4>::iterator iterator; + using iterator = typename SmallPtrSet<T, 4>::iterator; SmallPtrSet<T, 4> S; int InsertVisited = 0; @@ -44,8 +44,8 @@ template <typename T> class df_iterator_storage<CountedSet<T>, true> { }; TEST(DepthFirstIteratorTest, ActuallyUpdateIterator) { - typedef CountedSet<Graph<3>::NodeType *> StorageT; - typedef df_iterator<Graph<3>, StorageT, true> DFIter; + using StorageT = CountedSet<Graph<3>::NodeType *>; + using DFIter = df_iterator<Graph<3>, StorageT, true>; Graph<3> G; G.AddEdge(0, 1); diff --git a/llvm/unittests/ADT/FallibleIteratorTest.cpp b/llvm/unittests/ADT/FallibleIteratorTest.cpp index d3389744ffbfe..c17aa0393dfcb 100644 --- a/llvm/unittests/ADT/FallibleIteratorTest.cpp +++ b/llvm/unittests/ADT/FallibleIteratorTest.cpp @@ -19,8 +19,8 @@ using namespace llvm; namespace { -using ItemValid = enum { ValidItem, InvalidItem }; -using LinkValid = enum { ValidLink, InvalidLink }; +enum ItemValid { ValidItem, InvalidItem }; +enum LinkValid { ValidLink, InvalidLink }; class Item { public: diff --git a/llvm/unittests/ADT/IListBaseTest.cpp b/llvm/unittests/ADT/IListBaseTest.cpp index bd915688b190d..eeed488c28d88 100644 --- a/llvm/unittests/ADT/IListBaseTest.cpp +++ b/llvm/unittests/ADT/IListBaseTest.cpp @@ -19,13 +19,14 @@ template <typename T> class IListBaseTest : public ::testing::Test {}; class Parent; // Test variants with the same test. 
-typedef ::testing::Types<ilist_base<false, void>, ilist_base<true, void>, ilist_base<false, Parent*>, ilist_base<true, Parent*>> - IListBaseTestTypes; +using IListBaseTestTypes = + ::testing::Types<ilist_base<false, void>, ilist_base<true, void>, + ilist_base<false, Parent *>, ilist_base<true, Parent *>>; TYPED_TEST_SUITE(IListBaseTest, IListBaseTestTypes, ); TYPED_TEST(IListBaseTest, insertBeforeImpl) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S, A, B; @@ -51,8 +52,8 @@ TYPED_TEST(IListBaseTest, insertBeforeImpl) { } TYPED_TEST(IListBaseTest, removeImpl) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S, A, B; @@ -80,8 +81,8 @@ TYPED_TEST(IListBaseTest, removeImpl) { } TYPED_TEST(IListBaseTest, removeRangeImpl) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S, A, B, C, D; @@ -106,8 +107,8 @@ TYPED_TEST(IListBaseTest, removeRangeImpl) { } TYPED_TEST(IListBaseTest, removeRangeImplAllButSentinel) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S, A, B; @@ -126,8 +127,8 @@ TYPED_TEST(IListBaseTest, removeRangeImplAllButSentinel) { } TYPED_TEST(IListBaseTest, transferBeforeImpl) { - typedef TypeParam list_base_type; - typedef typename list_base_type::node_base_type node_base_type; + using list_base_type = TypeParam; + using node_base_type = typename list_base_type::node_base_type; node_base_type S1, S2, A, B, C, D, E; diff --git a/llvm/unittests/ADT/IListIteratorTest.cpp b/llvm/unittests/ADT/IListIteratorTest.cpp index 4e5b847b35ffe..54a4258246e9b 100644 --- a/llvm/unittests/ADT/IListIteratorTest.cpp +++ b/llvm/unittests/ADT/IListIteratorTest.cpp @@ -141,10 +141,10 @@ TEST(IListIteratorTest, ReverseConstructor) { L.insert(L.end(), B); // Save typing. - typedef simple_ilist<Node>::iterator iterator; - typedef simple_ilist<Node>::reverse_iterator reverse_iterator; - typedef simple_ilist<Node>::const_iterator const_iterator; - typedef simple_ilist<Node>::const_reverse_iterator const_reverse_iterator; + using iterator = simple_ilist<Node>::iterator; + using reverse_iterator = simple_ilist<Node>::reverse_iterator; + using const_iterator = simple_ilist<Node>::const_iterator; + using const_reverse_iterator = simple_ilist<Node>::const_reverse_iterator; // Check conversion values. 
EXPECT_EQ(L.begin(), iterator(L.rend())); diff --git a/llvm/unittests/ADT/IListNodeBaseTest.cpp b/llvm/unittests/ADT/IListNodeBaseTest.cpp index ef90c716a4118..393f83af99b76 100644 --- a/llvm/unittests/ADT/IListNodeBaseTest.cpp +++ b/llvm/unittests/ADT/IListNodeBaseTest.cpp @@ -17,10 +17,10 @@ namespace { class Parent {}; -typedef ilist_node_base<false, void> RawNode; -typedef ilist_node_base<true, void> TrackingNode; -typedef ilist_node_base<false, Parent> ParentNode; -typedef ilist_node_base<true, Parent> ParentTrackingNode; +using RawNode = ilist_node_base<false, void>; +using TrackingNode = ilist_node_base<true, void>; +using ParentNode = ilist_node_base<false, Parent>; +using ParentTrackingNode = ilist_node_base<true, Parent>; TEST(IListNodeBaseTest, DefaultConstructor) { RawNode A; diff --git a/llvm/unittests/ADT/IListSentinelTest.cpp b/llvm/unittests/ADT/IListSentinelTest.cpp index 1f4a8311370a6..709a1a4bb90e7 100644 --- a/llvm/unittests/ADT/IListSentinelTest.cpp +++ b/llvm/unittests/ADT/IListSentinelTest.cpp @@ -14,18 +14,17 @@ using namespace llvm; namespace { template <class T, class... Options> struct PickSentinel { - typedef ilist_sentinel< - typename ilist_detail::compute_node_options<T, Options...>::type> - type; + using type = ilist_sentinel< + typename ilist_detail::compute_node_options<T, Options...>::type>; }; class Node : public ilist_node<Node> {}; class TrackingNode : public ilist_node<Node, ilist_sentinel_tracking<true>> {}; -typedef PickSentinel<Node>::type Sentinel; -typedef PickSentinel<Node, ilist_sentinel_tracking<true>>::type - TrackingSentinel; -typedef PickSentinel<Node, ilist_sentinel_tracking<false>>::type - NoTrackingSentinel; +using Sentinel = PickSentinel<Node>::type; +using TrackingSentinel = + PickSentinel<Node, ilist_sentinel_tracking<true>>::type; +using NoTrackingSentinel = + PickSentinel<Node, ilist_sentinel_tracking<false>>::type; struct LocalAccess : ilist_detail::NodeAccess { using NodeAccess::getPrev; diff --git a/llvm/unittests/ADT/IntervalMapTest.cpp b/llvm/unittests/ADT/IntervalMapTest.cpp index 99a93ab198d89..38f397ff2eb54 100644 --- a/llvm/unittests/ADT/IntervalMapTest.cpp +++ b/llvm/unittests/ADT/IntervalMapTest.cpp @@ -14,9 +14,9 @@ using namespace llvm; namespace { -typedef IntervalMap<unsigned, unsigned, 4> UUMap; -typedef IntervalMap<unsigned, unsigned, 4, - IntervalMapHalfOpenInfo<unsigned>> UUHalfOpenMap; +using UUMap = IntervalMap<unsigned, unsigned, 4>; +using UUHalfOpenMap = + IntervalMap<unsigned, unsigned, 4, IntervalMapHalfOpenInfo<unsigned>>; // Empty map tests TEST(IntervalMapTest, EmptyMap) { @@ -713,7 +713,7 @@ TEST(IntervalMapTest, OverlapsHalfOpen) { } TEST(IntervalMapOverlapsTest, SmallMaps) { - typedef IntervalMapOverlaps<UUMap,UUMap> UUOverlaps; + using UUOverlaps = IntervalMapOverlaps<UUMap, UUMap>; UUMap::Allocator allocator; UUMap mapA(allocator); UUMap mapB(allocator); @@ -757,7 +757,7 @@ TEST(IntervalMapOverlapsTest, SmallMaps) { } TEST(IntervalMapOverlapsTest, BigMaps) { - typedef IntervalMapOverlaps<UUMap,UUMap> UUOverlaps; + using UUOverlaps = IntervalMapOverlaps<UUMap, UUMap>; UUMap::Allocator allocator; UUMap mapA(allocator); UUMap mapB(allocator); diff --git a/llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp b/llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp index f4f2083482804..6da42271764bc 100644 --- a/llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp +++ b/llvm/unittests/ADT/IntrusiveRefCntPtrTest.cpp @@ -25,9 +25,9 @@ struct SimpleRefCounted : Base<SimpleRefCounted<Base>> { template <typename T> struct 
IntrusiveRefCntPtrTest : testing::Test {}; -typedef ::testing::Types<SimpleRefCounted<RefCountedBase>, - SimpleRefCounted<ThreadSafeRefCountedBase>> - IntrusiveRefCntTypes; +using IntrusiveRefCntTypes = + ::testing::Types<SimpleRefCounted<RefCountedBase>, + SimpleRefCounted<ThreadSafeRefCountedBase>>; TYPED_TEST_SUITE(IntrusiveRefCntPtrTest, IntrusiveRefCntTypes, ); TYPED_TEST(IntrusiveRefCntPtrTest, RefCountedBaseCopyDoesNotLeak) { diff --git a/llvm/unittests/ADT/IteratorTest.cpp b/llvm/unittests/ADT/IteratorTest.cpp index b5d63efd8ccba..9dd8c1a84f44a 100644 --- a/llvm/unittests/ADT/IteratorTest.cpp +++ b/llvm/unittests/ADT/IteratorTest.cpp @@ -177,8 +177,8 @@ TEST(PointeeIteratorTest, Basic) { V.push_back(&arr[2]); V.push_back(&arr[3]); - typedef pointee_iterator<SmallVectorImpl<int *>::const_iterator> - test_iterator; + using test_iterator = + pointee_iterator<SmallVectorImpl<int *>::const_iterator>; test_iterator Begin, End; Begin = V.begin(); @@ -218,9 +218,8 @@ TEST(PointeeIteratorTest, SmartPointer) { V.push_back(std::make_unique<int>(3)); V.push_back(std::make_unique<int>(4)); - typedef pointee_iterator< - SmallVectorImpl<std::unique_ptr<int>>::const_iterator> - test_iterator; + using test_iterator = + pointee_iterator<SmallVectorImpl<std::unique_ptr<int>>::const_iterator>; test_iterator Begin, End; Begin = V.begin(); diff --git a/llvm/unittests/ADT/PointerSumTypeTest.cpp b/llvm/unittests/ADT/PointerSumTypeTest.cpp index fbf59f3a2fda5..11e657ad8bd25 100644 --- a/llvm/unittests/ADT/PointerSumTypeTest.cpp +++ b/llvm/unittests/ADT/PointerSumTypeTest.cpp @@ -17,10 +17,9 @@ struct PointerSumTypeTest : public testing::Test { float f; int i1, i2; - typedef PointerSumType<Kinds, PointerSumTypeMember<Float, float *>, - PointerSumTypeMember<Int1, int *>, - PointerSumTypeMember<Int2, int *>> - SumType; + using SumType = PointerSumType<Kinds, PointerSumTypeMember<Float, float *>, + PointerSumTypeMember<Int1, int *>, + PointerSumTypeMember<Int2, int *>>; SumType a, b, c, n; PointerSumTypeTest() diff --git a/llvm/unittests/ADT/PointerUnionTest.cpp b/llvm/unittests/ADT/PointerUnionTest.cpp index acddb78960149..d8ac3aed76da2 100644 --- a/llvm/unittests/ADT/PointerUnionTest.cpp +++ b/llvm/unittests/ADT/PointerUnionTest.cpp @@ -12,9 +12,9 @@ using namespace llvm; namespace { -typedef PointerUnion<int *, float *> PU; -typedef PointerUnion<int *, float *, long long *> PU3; -typedef PointerUnion<int *, float *, long long *, double *> PU4; +using PU = PointerUnion<int *, float *>; +using PU3 = PointerUnion<int *, float *, long long *>; +using PU4 = PointerUnion<int *, float *, long long *, double *>; struct PointerUnionTest : public testing::Test { float f; @@ -116,9 +116,9 @@ TEST_F(PointerUnionTest, Get) { template<int I> struct alignas(8) Aligned {}; -typedef PointerUnion<Aligned<0> *, Aligned<1> *, Aligned<2> *, Aligned<3> *, - Aligned<4> *, Aligned<5> *, Aligned<6> *, Aligned<7> *> - PU8; +using PU8 = + PointerUnion<Aligned<0> *, Aligned<1> *, Aligned<2> *, Aligned<3> *, + Aligned<4> *, Aligned<5> *, Aligned<6> *, Aligned<7> *>; TEST_F(PointerUnionTest, ManyElements) { Aligned<0> a0; diff --git a/llvm/unittests/ADT/PostOrderIteratorTest.cpp b/llvm/unittests/ADT/PostOrderIteratorTest.cpp index 838481f76ed7f..e875dd63a1958 100644 --- a/llvm/unittests/ADT/PostOrderIteratorTest.cpp +++ b/llvm/unittests/ADT/PostOrderIteratorTest.cpp @@ -23,7 +23,7 @@ namespace { // Whether we're able to compile TEST(PostOrderIteratorTest, Compiles) { - typedef SmallPtrSet<void *, 4> ExtSetTy; + using ExtSetTy = 
SmallPtrSet<void *, 4>; // Tests that template specializations are kept up to date void *Null = nullptr; diff --git a/llvm/unittests/ADT/PriorityWorklistTest.cpp b/llvm/unittests/ADT/PriorityWorklistTest.cpp index f12d32ac9f496..08a47736c392e 100644 --- a/llvm/unittests/ADT/PriorityWorklistTest.cpp +++ b/llvm/unittests/ADT/PriorityWorklistTest.cpp @@ -20,8 +20,8 @@ namespace { using namespace llvm; template <typename T> class PriorityWorklistTest : public ::testing::Test {}; -typedef ::testing::Types<PriorityWorklist<int>, SmallPriorityWorklist<int, 2>> - TestTypes; +using TestTypes = + ::testing::Types<PriorityWorklist<int>, SmallPriorityWorklist<int, 2>>; TYPED_TEST_SUITE(PriorityWorklistTest, TestTypes, ); TYPED_TEST(PriorityWorklistTest, Basic) { diff --git a/llvm/unittests/ADT/RangeAdapterTest.cpp b/llvm/unittests/ADT/RangeAdapterTest.cpp index c1a8a984f233b..6849ccbc8052d 100644 --- a/llvm/unittests/ADT/RangeAdapterTest.cpp +++ b/llvm/unittests/ADT/RangeAdapterTest.cpp @@ -24,8 +24,8 @@ class ReverseOnlyVector { public: ReverseOnlyVector(std::initializer_list<int> list) : Vec(list) {} - typedef std::vector<int>::reverse_iterator reverse_iterator; - typedef std::vector<int>::const_reverse_iterator const_reverse_iterator; + using reverse_iterator = std::vector<int>::reverse_iterator; + using const_reverse_iterator = std::vector<int>::const_reverse_iterator; reverse_iterator rbegin() { return Vec.rbegin(); } reverse_iterator rend() { return Vec.rend(); } const_reverse_iterator rbegin() const { return Vec.rbegin(); } @@ -41,11 +41,11 @@ class BidirectionalVector { public: BidirectionalVector(std::initializer_list<int> list) : Vec(list) {} - typedef std::vector<int>::iterator iterator; + using iterator = std::vector<int>::iterator; iterator begin() const; iterator end() const; - typedef std::vector<int>::reverse_iterator reverse_iterator; + using reverse_iterator = std::vector<int>::reverse_iterator; reverse_iterator rbegin() const { return Vec.rbegin(); } reverse_iterator rend() const { return Vec.rend(); } }; @@ -58,15 +58,15 @@ class BidirectionalVectorConsts { public: BidirectionalVectorConsts(std::initializer_list<int> list) : Vec(list) {} - typedef std::vector<int>::iterator iterator; - typedef std::vector<int>::const_iterator const_iterator; + using iterator = std::vector<int>::iterator; + using const_iterator = std::vector<int>::const_iterator; iterator begin(); iterator end(); const_iterator begin() const; const_iterator end() const; - typedef std::vector<int>::reverse_iterator reverse_iterator; - typedef std::vector<int>::const_reverse_iterator const_reverse_iterator; + using reverse_iterator = std::vector<int>::reverse_iterator; + using const_reverse_iterator = std::vector<int>::const_reverse_iterator; reverse_iterator rbegin() { return Vec.rbegin(); } reverse_iterator rend() { return Vec.rend(); } const_reverse_iterator rbegin() const { return Vec.rbegin(); } @@ -80,7 +80,7 @@ class CustomIteratorVector { public: CustomIteratorVector(std::initializer_list<int> list) : V(list) {} - typedef std::vector<int>::iterator iterator; + using iterator = std::vector<int>::iterator; class reverse_iterator { std::vector<int>::iterator I; @@ -126,8 +126,8 @@ template <typename R> void TestRev(const R &r) { // Test fixture template <typename T> class RangeAdapterLValueTest : public ::testing::Test {}; -typedef ::testing::Types<std::vector<int>, std::list<int>, int[4]> - RangeAdapterLValueTestTypes; +using RangeAdapterLValueTestTypes = + ::testing::Types<std::vector<int>, std::list<int>, 
int[4]>; TYPED_TEST_SUITE(RangeAdapterLValueTest, RangeAdapterLValueTestTypes, ); TYPED_TEST(RangeAdapterLValueTest, TrivialOperation) { @@ -140,10 +140,10 @@ TYPED_TEST(RangeAdapterLValueTest, TrivialOperation) { template <typename T> struct RangeAdapterRValueTest : testing::Test {}; -typedef ::testing::Types<std::vector<int>, std::list<int>, CustomIteratorVector, - ReverseOnlyVector, BidirectionalVector, - BidirectionalVectorConsts> - RangeAdapterRValueTestTypes; +using RangeAdapterRValueTestTypes = + ::testing::Types<std::vector<int>, std::list<int>, CustomIteratorVector, + ReverseOnlyVector, BidirectionalVector, + BidirectionalVectorConsts>; TYPED_TEST_SUITE(RangeAdapterRValueTest, RangeAdapterRValueTestTypes, ); TYPED_TEST(RangeAdapterRValueTest, TrivialOperation) { diff --git a/llvm/unittests/ADT/SCCIteratorTest.cpp b/llvm/unittests/ADT/SCCIteratorTest.cpp index 48350959d046b..5f088294b1a2d 100644 --- a/llvm/unittests/ADT/SCCIteratorTest.cpp +++ b/llvm/unittests/ADT/SCCIteratorTest.cpp @@ -21,7 +21,7 @@ TEST(SCCIteratorTest, AllSmallGraphs) { // create graphs for which every node has a self-edge. #define NUM_NODES 4 #define NUM_GRAPHS (NUM_NODES * (NUM_NODES - 1)) - typedef Graph<NUM_NODES> GT; + using GT = Graph<NUM_NODES>; /// Enumerate all graphs using NUM_GRAPHS bits. static_assert(NUM_GRAPHS < sizeof(unsigned) * CHAR_BIT, "Too many graphs!"); diff --git a/llvm/unittests/ADT/STLExtrasTest.cpp b/llvm/unittests/ADT/STLExtrasTest.cpp index 966b1f01e8a31..85567775e4ebd 100644 --- a/llvm/unittests/ADT/STLExtrasTest.cpp +++ b/llvm/unittests/ADT/STLExtrasTest.cpp @@ -60,7 +60,7 @@ TEST(STLExtrasTest, EnumerateLValue) { // Test that a simple LValue can be enumerated and gives correct results with // multiple types, including the empty container. std::vector<char> foo = {'a', 'b', 'c'}; - typedef std::pair<std::size_t, char> CharPairType; + using CharPairType = std::pair<std::size_t, char>; std::vector<CharPairType> CharResults; for (auto [index, value] : llvm::enumerate(foo)) { @@ -72,7 +72,7 @@ TEST(STLExtrasTest, EnumerateLValue) { CharPairType(2u, 'c'))); // Test a const range of a different type. - typedef std::pair<std::size_t, int> IntPairType; + using IntPairType = std::pair<std::size_t, int>; std::vector<IntPairType> IntResults; const std::vector<int> bar = {1, 2, 3}; for (auto [index, value] : llvm::enumerate(bar)) { @@ -111,7 +111,7 @@ TEST(STLExtrasTest, EnumerateModifyLValue) { TEST(STLExtrasTest, EnumerateRValueRef) { // Test that an rvalue can be enumerated. - typedef std::pair<std::size_t, int> PairType; + using PairType = std::pair<std::size_t, int>; std::vector<PairType> Results; auto Enumerator = llvm::enumerate(std::vector<int>{1, 2, 3}); @@ -138,7 +138,7 @@ TEST(STLExtrasTest, EnumerateModifyRValue) { // Test that when enumerating an rvalue, modification still works (even if // this isn't terribly useful, it at least shows that we haven't snuck an // extra const in there somewhere. 
- typedef std::pair<std::size_t, char> PairType; + using PairType = std::pair<std::size_t, char>; std::vector<PairType> Results; for (auto X : llvm::enumerate(std::vector<char>{'1', '2', '3'})) { diff --git a/llvm/unittests/ADT/SimpleIListTest.cpp b/llvm/unittests/ADT/SimpleIListTest.cpp index c2992baf8a5f7..cf3df8c293e25 100644 --- a/llvm/unittests/ADT/SimpleIListTest.cpp +++ b/llvm/unittests/ADT/SimpleIListTest.cpp @@ -605,8 +605,8 @@ struct Tag2 {}; struct DoubleNode : ilist_node<DoubleNode, ilist_tag<Tag1>>, ilist_node<DoubleNode, ilist_tag<Tag2>> { - typedef ilist_node<DoubleNode, ilist_tag<Tag1>> Node1Type; - typedef ilist_node<DoubleNode, ilist_tag<Tag2>> Node2Type; + using Node1Type = ilist_node<DoubleNode, ilist_tag<Tag1>>; + using Node2Type = ilist_node<DoubleNode, ilist_tag<Tag2>>; Node1Type::self_iterator getIterator1() { return Node1Type::getIterator(); } Node2Type::self_iterator getIterator2() { return Node2Type::getIterator(); } @@ -617,8 +617,8 @@ struct DoubleNode : ilist_node<DoubleNode, ilist_tag<Tag1>>, return Node2Type::getIterator(); } }; -typedef simple_ilist<DoubleNode, ilist_tag<Tag1>> TaggedList1Type; -typedef simple_ilist<DoubleNode, ilist_tag<Tag2>> TaggedList2Type; +using TaggedList1Type = simple_ilist<DoubleNode, ilist_tag<Tag1>>; +using TaggedList2Type = simple_ilist<DoubleNode, ilist_tag<Tag2>>; TEST(SimpleIListTest, TaggedLists) { TaggedList1Type L1; diff --git a/llvm/unittests/ADT/SmallPtrSetTest.cpp b/llvm/unittests/ADT/SmallPtrSetTest.cpp index a627091b90c70..fe7a8279d06b1 100644 --- a/llvm/unittests/ADT/SmallPtrSetTest.cpp +++ b/llvm/unittests/ADT/SmallPtrSetTest.cpp @@ -57,7 +57,7 @@ TEST(SmallPtrSetTest, GrowthTest) { SmallPtrSet<int *, 4> s; - typedef SmallPtrSet<int *, 4>::iterator iter; + using iter = SmallPtrSet<int *, 4>::iterator; s.insert(&buf[0]); s.insert(&buf[1]); diff --git a/llvm/unittests/ADT/SmallStringTest.cpp b/llvm/unittests/ADT/SmallStringTest.cpp index 2f4df8afeafa5..db858246c9bbf 100644 --- a/llvm/unittests/ADT/SmallStringTest.cpp +++ b/llvm/unittests/ADT/SmallStringTest.cpp @@ -23,7 +23,7 @@ namespace { // Test fixture class class SmallStringTest : public testing::Test { protected: - typedef SmallString<40> StringType; + using StringType = SmallString<40>; StringType theString; diff --git a/llvm/unittests/ADT/SmallVectorTest.cpp b/llvm/unittests/ADT/SmallVectorTest.cpp index 74fc737f29335..dbc626db54482 100644 --- a/llvm/unittests/ADT/SmallVectorTest.cpp +++ b/llvm/unittests/ADT/SmallVectorTest.cpp @@ -226,13 +226,10 @@ class SmallVectorTest : public SmallVectorTestBase { VectorT otherVector; }; - -typedef ::testing::Types<SmallVector<Constructable, 0>, - SmallVector<Constructable, 1>, - SmallVector<Constructable, 2>, - SmallVector<Constructable, 4>, - SmallVector<Constructable, 5> - > SmallVectorTestTypes; +using SmallVectorTestTypes = ::testing::Types< + SmallVector<Constructable, 0>, SmallVector<Constructable, 1>, + SmallVector<Constructable, 2>, SmallVector<Constructable, 4>, + SmallVector<Constructable, 5>>; TYPED_TEST_SUITE(SmallVectorTest, SmallVectorTestTypes, ); // Constructor test. 
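The recurring change running through these unit-test files replaces `typedef` declarations with `using` aliases. The two spellings declare exactly the same type; the `using` form simply puts the alias name first and is the only form that extends to alias templates. A minimal sketch of the pattern follows (the `RangeList` name matches BitVectorTest.cpp above; `PairVector` is a hypothetical addition for illustration and is not part of the patch):

```cpp
#include <type_traits>
#include <utility>
#include <vector>

// Old spelling: the alias name is embedded in the declarator.
typedef std::vector<std::pair<int, int>> RangeListTypedef;

// New spelling: name on the left, aliased type on the right.
using RangeList = std::vector<std::pair<int, int>>;

// Alias templates are only expressible with 'using' (hypothetical example).
template <typename T> using PairVector = std::vector<std::pair<T, T>>;

static_assert(std::is_same_v<RangeListTypedef, RangeList>,
              "both spellings name the same type");
```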
@@ -537,11 +534,11 @@ TYPED_TEST(SmallVectorTest, AppendNonIterTest) { } struct output_iterator { - typedef std::output_iterator_tag iterator_category; - typedef int value_type; - typedef int difference_type; - typedef value_type *pointer; - typedef value_type &reference; + using iterator_category = std::output_iterator_tag; + using value_type = int; + using difference_type = int; + using pointer = value_type *; + using reference = value_type &; operator int() { return 2; } operator Constructable() { return 7; } }; @@ -896,7 +893,7 @@ class DualSmallVectorsTest<std::pair<VectorT1, VectorT2>> : public SmallVectorTe VectorT2 otherVector; }; -typedef ::testing::Types< +using DualSmallVectorTestTypes = ::testing::Types< // Small mode -> Small mode. std::pair<SmallVector<Constructable, 4>, SmallVector<Constructable, 4>>, // Small mode -> Big mode. @@ -904,8 +901,7 @@ typedef ::testing::Types< // Big mode -> Small mode. std::pair<SmallVector<Constructable, 2>, SmallVector<Constructable, 4>>, // Big mode -> Big mode. - std::pair<SmallVector<Constructable, 2>, SmallVector<Constructable, 2>> - > DualSmallVectorTestTypes; + std::pair<SmallVector<Constructable, 2>, SmallVector<Constructable, 2>>>; TYPED_TEST_SUITE(DualSmallVectorsTest, DualSmallVectorTestTypes, ); diff --git a/llvm/unittests/ADT/SparseMultiSetTest.cpp b/llvm/unittests/ADT/SparseMultiSetTest.cpp index 54f7bc99b52fa..91d37f4684b9e 100644 --- a/llvm/unittests/ADT/SparseMultiSetTest.cpp +++ b/llvm/unittests/ADT/SparseMultiSetTest.cpp @@ -13,7 +13,7 @@ using namespace llvm; namespace { -typedef SparseMultiSet<unsigned> USet; +using USet = SparseMultiSet<unsigned>; // Empty set tests. TEST(SparseMultiSetTest, EmptySet) { @@ -211,7 +211,7 @@ struct Alt { }; TEST(SparseMultiSetTest, AltStructSet) { - typedef SparseMultiSet<Alt> ASet; + using ASet = SparseMultiSet<Alt>; ASet Set; Set.setUniverse(10); Set.insert(Alt(1005)); diff --git a/llvm/unittests/ADT/SparseSetTest.cpp b/llvm/unittests/ADT/SparseSetTest.cpp index 4fbf1caa247b7..f2b932907dc38 100644 --- a/llvm/unittests/ADT/SparseSetTest.cpp +++ b/llvm/unittests/ADT/SparseSetTest.cpp @@ -13,7 +13,7 @@ using namespace llvm; namespace { -typedef SparseSet<unsigned> USet; +using USet = SparseSet<unsigned>; // Empty set tests. TEST(SparseSetTest, EmptySet) { @@ -166,7 +166,7 @@ struct Alt { }; TEST(SparseSetTest, AltStructSet) { - typedef SparseSet<Alt> ASet; + using ASet = SparseSet<Alt>; ASet Set; Set.setUniverse(10); Set.insert(Alt(1005)); diff --git a/llvm/unittests/ADT/StringSwitchTest.cpp b/llvm/unittests/ADT/StringSwitchTest.cpp index c94feb54d0b7d..75d50f4dd1b5b 100644 --- a/llvm/unittests/ADT/StringSwitchTest.cpp +++ b/llvm/unittests/ADT/StringSwitchTest.cpp @@ -240,6 +240,23 @@ TEST(StringSwitchTest, CasesCopies) { EXPECT_EQ(NumCopies, 1u); } +TEST(StringSwitchTest, StringSwitchMultipleMatches) { + auto Translate = [](StringRef S) { + return llvm::StringSwitch<int>(S) + .CaseLower("A", 0) + .Case("b", 1) + .Case("a", 2) + .CasesLower({"a", "b"}, 3) + .DefaultUnreachable(); + }; + + // Check that the value of the first match is returned. 
+ EXPECT_EQ(0, Translate("A")); + EXPECT_EQ(0, Translate("a")); + EXPECT_EQ(3, Translate("B")); + EXPECT_EQ(1, Translate("b")); +} + TEST(StringSwitchTest, DefaultUnreachable) { auto Translate = [](StringRef S) { return llvm::StringSwitch<int>(S) diff --git a/llvm/unittests/ADT/TestGraph.h b/llvm/unittests/ADT/TestGraph.h index a59ab504f7144..bb2ec47a0d5fe 100644 --- a/llvm/unittests/ADT/TestGraph.h +++ b/llvm/unittests/ADT/TestGraph.h @@ -34,7 +34,7 @@ class Graph { /// NodeSubset - A subset of the graph's nodes. class NodeSubset { - typedef unsigned char BitVector; // Where the limitation N <= 8 comes from. + using BitVector = unsigned char; // Where the limitation N <= 8 comes from. BitVector Elements; NodeSubset(BitVector e) : Elements(e) {} public: @@ -96,7 +96,7 @@ class Graph { }; /// NodeType - Node index and set of children of the node. - typedef std::pair<unsigned, NodeSubset> NodeType; + using NodeType = std::pair<unsigned, NodeSubset>; private: /// Nodes - The list of nodes for this graph. @@ -233,8 +233,8 @@ class Graph { template <unsigned N> struct GraphTraits<Graph<N> > { - typedef typename Graph<N>::NodeType *NodeRef; - typedef typename Graph<N>::ChildIterator ChildIteratorType; + using NodeRef = typename Graph<N>::NodeType *; + using ChildIteratorType = typename Graph<N>::ChildIterator; static NodeRef getEntryNode(const Graph<N> &G) { return G.AccessNode(0); } static ChildIteratorType child_begin(NodeRef Node) { diff --git a/llvm/unittests/ADT/TinyPtrVectorTest.cpp b/llvm/unittests/ADT/TinyPtrVectorTest.cpp index af4ae4f4b0db9..c77721df5055c 100644 --- a/llvm/unittests/ADT/TinyPtrVectorTest.cpp +++ b/llvm/unittests/ADT/TinyPtrVectorTest.cpp @@ -28,14 +28,14 @@ template <typename PointerTy, unsigned IntBits, typename IntType, typename PtrTraits, typename Info> struct RemovePointer< PointerIntPair<PointerTy, IntBits, IntType, PtrTraits, Info>> { - typedef typename RemovePointer<PointerTy>::type type; + using type = typename RemovePointer<PointerTy>::type; }; template <typename VectorT> class TinyPtrVectorTest : public testing::Test { protected: - typedef typename VectorT::value_type PtrT; - typedef typename RemovePointer<PtrT>::type ValueT; + using PtrT = typename VectorT::value_type; + using ValueT = typename RemovePointer<PtrT>::type; using PtrTraits = PointerLikeTypeTraits<PtrT>; VectorT V; @@ -78,9 +78,9 @@ class TinyPtrVectorTest : public testing::Test { } }; -typedef ::testing::Types<TinyPtrVector<int *>, TinyPtrVector<double *>, - TinyPtrVector<PointerIntPair<int *, 1>>> - TinyPtrVectorTestTypes; +using TinyPtrVectorTestTypes = + ::testing::Types<TinyPtrVector<int *>, TinyPtrVector<double *>, + TinyPtrVector<PointerIntPair<int *, 1>>>; TYPED_TEST_SUITE(TinyPtrVectorTest, TinyPtrVectorTestTypes, ); TYPED_TEST(TinyPtrVectorTest, EmptyTest) { diff --git a/llvm/unittests/CAS/ActionCacheTest.cpp b/llvm/unittests/CAS/ActionCacheTest.cpp index db67e30ca203b..692da230b6e09 100644 --- a/llvm/unittests/CAS/ActionCacheTest.cpp +++ b/llvm/unittests/CAS/ActionCacheTest.cpp @@ -21,7 +21,7 @@ using namespace llvm; using namespace llvm::cas; TEST_P(CASTest, ActionCacheHit) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); std::unique_ptr<ActionCache> Cache = createActionCache(); std::optional<ObjectProxy> ID; @@ -36,7 +36,7 @@ TEST_P(CASTest, ActionCacheHit) { } TEST_P(CASTest, ActionCacheMiss) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); 
std::unique_ptr<ActionCache> Cache = createActionCache(); std::optional<ObjectProxy> ID1, ID2; @@ -59,7 +59,7 @@ TEST_P(CASTest, ActionCacheMiss) { } TEST_P(CASTest, ActionCacheRewrite) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); std::unique_ptr<ActionCache> Cache = createActionCache(); std::optional<ObjectProxy> ID1, ID2; diff --git a/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp b/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp new file mode 100644 index 0000000000000..19522e9372d85 --- /dev/null +++ b/llvm/unittests/CAS/BuiltinUnifiedCASDatabasesTest.cpp @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/BuiltinUnifiedCASDatabases.h" +#include "CASTestConfig.h" +#include "llvm/CAS/ActionCache.h" +#include "llvm/CAS/ObjectStore.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::cas; + +TEST_F(OnDiskCASTest, UnifiedCASMaterializationCheckPreventsGarbageCollection) { + unittest::TempDir Temp("on-disk-unified-cas", /*Unique=*/true); + + auto WithCAS = [&](llvm::function_ref<void(ObjectStore &)> Action) { + std::pair<std::unique_ptr<ObjectStore>, std::unique_ptr<ActionCache>> DBs; + ASSERT_THAT_ERROR( + createOnDiskUnifiedCASDatabases(Temp.path()).moveInto(DBs), + Succeeded()); + ObjectStore &CAS = *DBs.first; + ASSERT_THAT_ERROR(CAS.setSizeLimit(1), Succeeded()); + Action(CAS); + }; + + std::optional<CASID> ID; + + // Create an object in the CAS. + WithCAS([&ID](ObjectStore &CAS) { + std::optional<ObjectRef> Ref; + ASSERT_THAT_ERROR(CAS.store({}, "blah").moveInto(Ref), Succeeded()); + ASSERT_TRUE(Ref.has_value()); + + ID = CAS.getID(*Ref); + }); + + // Check materialization and prune the storage. + WithCAS([&ID](ObjectStore &CAS) { + std::optional<ObjectRef> Ref = CAS.getReference(*ID); + ASSERT_TRUE(Ref.has_value()); + + std::optional<bool> IsMaterialized; + ASSERT_THAT_ERROR(CAS.isMaterialized(*Ref).moveInto(IsMaterialized), + Succeeded()); + ASSERT_TRUE(IsMaterialized); + + ASSERT_THAT_ERROR(CAS.pruneStorageData(), Succeeded()); + }); + + // Verify that the previous materialization check kept the object in the CAS. 
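+  // (With the 1-byte size limit set inside WithCAS, pruning could otherwise
+  // have discarded it.)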
+ WithCAS([&ID](ObjectStore &CAS) { + std::optional<ObjectRef> Ref = CAS.getReference(*ID); + ASSERT_TRUE(Ref.has_value()); + + std::optional<bool> IsMaterialized; + ASSERT_THAT_ERROR(CAS.isMaterialized(*Ref).moveInto(IsMaterialized), + Succeeded()); + ASSERT_TRUE(IsMaterialized); + }); +} diff --git a/llvm/unittests/CAS/CASTestConfig.cpp b/llvm/unittests/CAS/CASTestConfig.cpp index 10e4b689e151e..08cbf1daf727d 100644 --- a/llvm/unittests/CAS/CASTestConfig.cpp +++ b/llvm/unittests/CAS/CASTestConfig.cpp @@ -8,6 +8,7 @@ #include "CASTestConfig.h" #include "llvm/CAS/ObjectStore.h" +#include "llvm/Testing/Support/Error.h" #include "gtest/gtest.h" #include <mutex> @@ -15,7 +16,8 @@ using namespace llvm; using namespace llvm::cas; static CASTestingEnv createInMemory(int I) { - return CASTestingEnv{createInMemoryCAS(), createInMemoryActionCache()}; + return CASTestingEnv{createInMemoryCAS(), createInMemoryActionCache(), + std::nullopt}; } INSTANTIATE_TEST_SUITE_P(InMemoryCAS, CASTest, @@ -23,7 +25,7 @@ INSTANTIATE_TEST_SUITE_P(InMemoryCAS, CASTest, #if LLVM_ENABLE_ONDISK_CAS namespace llvm::cas::ondisk { -extern void setMaxMappingSize(uint64_t Size); +void setMaxMappingSize(uint64_t Size); } // namespace llvm::cas::ondisk void setMaxOnDiskCASMappingSize() { @@ -31,6 +33,17 @@ void setMaxOnDiskCASMappingSize() { std::call_once( Flag, [] { llvm::cas::ondisk::setMaxMappingSize(100 * 1024 * 1024); }); } + +static CASTestingEnv createOnDisk(int I) { + unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + std::unique_ptr<ObjectStore> CAS; + EXPECT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + std::unique_ptr<ActionCache> Cache; + EXPECT_THAT_ERROR(createOnDiskActionCache(Temp.path()).moveInto(Cache), + Succeeded()); + return CASTestingEnv{std::move(CAS), std::move(Cache), std::move(Temp)}; +} +INSTANTIATE_TEST_SUITE_P(OnDiskCAS, CASTest, ::testing::Values(createOnDisk)); #else void setMaxOnDiskCASMappingSize() {} #endif /* LLVM_ENABLE_ONDISK_CAS */ diff --git a/llvm/unittests/CAS/CASTestConfig.h b/llvm/unittests/CAS/CASTestConfig.h index 8d3c55305f1b3..b1c0e59ff2b92 100644 --- a/llvm/unittests/CAS/CASTestConfig.h +++ b/llvm/unittests/CAS/CASTestConfig.h @@ -6,16 +6,28 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_UNITTESTS_CASTESTCONFIG_H +#define LLVM_UNITTESTS_CASTESTCONFIG_H + #include "llvm/CAS/ActionCache.h" #include "llvm/CAS/ObjectStore.h" +#include "llvm/Testing/Support/SupportHelpers.h" #include "gtest/gtest.h" +#include <memory> -#ifndef LLVM_UNITTESTS_CASTESTCONFIG_H -#define LLVM_UNITTESTS_CASTESTCONFIG_H +namespace llvm::unittest::cas { +class MockEnv { + void anchor(); + +public: + virtual ~MockEnv(); +}; +} // namespace llvm::unittest::cas struct CASTestingEnv { std::unique_ptr<llvm::cas::ObjectStore> CAS; std::unique_ptr<llvm::cas::ActionCache> Cache; + std::optional<llvm::unittest::TempDir> Temp; }; void setMaxOnDiskCASMappingSize(); @@ -24,26 +36,47 @@ void setMaxOnDiskCASMappingSize(); class OnDiskCASTest : public ::testing::Test { protected: void SetUp() override { +#if !LLVM_ENABLE_ONDISK_CAS + GTEST_SKIP() << "OnDiskCAS is not enabled"; +#endif // Use a smaller database size for testing to conserve disk space. setMaxOnDiskCASMappingSize(); } }; +// Parametered test fixture for ObjectStore and ActionCache tests. 
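+// Each parameter is a factory returning a CASTestingEnv that bundles the
+// ObjectStore, the ActionCache, and (for on-disk configurations) the TempDir
+// that owns their backing storage.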
class CASTest : public testing::TestWithParam<std::function<CASTestingEnv(int)>> { protected: std::optional<int> NextCASIndex; + llvm::SmallVector<llvm::unittest::TempDir> Dirs; + + llvm::SmallVector<std::unique_ptr<llvm::unittest::cas::MockEnv>> Envs; + std::unique_ptr<llvm::cas::ObjectStore> createObjectStore() { auto TD = GetParam()(++(*NextCASIndex)); + if (TD.Temp) + Dirs.push_back(std::move(*TD.Temp)); return std::move(TD.CAS); } std::unique_ptr<llvm::cas::ActionCache> createActionCache() { auto TD = GetParam()(++(*NextCASIndex)); + if (TD.Temp) + Dirs.push_back(std::move(*TD.Temp)); return std::move(TD.Cache); } - void SetUp() override { NextCASIndex = 0; } - void TearDown() override { NextCASIndex = std::nullopt; } + + void SetUp() override { + NextCASIndex = 0; + setMaxOnDiskCASMappingSize(); + } + + void TearDown() override { + NextCASIndex = std::nullopt; + Dirs.clear(); + Envs.clear(); + } }; #endif diff --git a/llvm/unittests/CAS/CMakeLists.txt b/llvm/unittests/CAS/CMakeLists.txt index da469f7fccb5a..91e49be770745 100644 --- a/llvm/unittests/CAS/CMakeLists.txt +++ b/llvm/unittests/CAS/CMakeLists.txt @@ -1,9 +1,11 @@ set(ONDISK_CAS_TEST_SOURCES + BuiltinUnifiedCASDatabasesTest.cpp OnDiskGraphDBTest.cpp OnDiskDataAllocatorTest.cpp OnDiskKeyValueDBTest.cpp OnDiskTrieRawHashMapTest.cpp ProgramTest.cpp + UnifiedOnDiskCacheTest.cpp ) set(LLVM_OPTIONAL_SOURCES diff --git a/llvm/unittests/CAS/ObjectStoreTest.cpp b/llvm/unittests/CAS/ObjectStoreTest.cpp index 54083fdb408f6..b43ae33d74127 100644 --- a/llvm/unittests/CAS/ObjectStoreTest.cpp +++ b/llvm/unittests/CAS/ObjectStoreTest.cpp @@ -1,4 +1,4 @@ -//===- ObjectStoreTest.cpp ------------------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -76,7 +76,7 @@ multiline text multiline text multiline text multiline text multiline text)", // Run validation on all CASIDs. for (int I = 0, E = IDs.size(); I != E; ++I) - ASSERT_THAT_ERROR(CAS1->validate(IDs[I]), Succeeded()); + ASSERT_THAT_ERROR(CAS1->validateObject(IDs[I]), Succeeded()); // Check that the blobs can be retrieved multiple times. for (int I = 0, E = IDs.size(); I != E; ++I) { @@ -120,15 +120,15 @@ TEST_P(CASTest, BlobsBig) { std::optional<CASID> ID2; ASSERT_THAT_ERROR(CAS->createProxy({}, String1).moveInto(ID1), Succeeded()); ASSERT_THAT_ERROR(CAS->createProxy({}, String1).moveInto(ID2), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID1), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID2), Succeeded()); ASSERT_EQ(ID1, ID2); String1.append(String2); ASSERT_THAT_ERROR(CAS->createProxy({}, String2).moveInto(ID1), Succeeded()); ASSERT_THAT_ERROR(CAS->createProxy({}, String2).moveInto(ID2), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID1), Succeeded()); - ASSERT_THAT_ERROR(CAS->validate(*ID2), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID1), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(*ID2), Succeeded()); ASSERT_EQ(ID1, ID2); String2.append(String1); } @@ -176,10 +176,11 @@ multiline text multiline text multiline text multiline text multiline text)", // Check basic printing of IDs. 
IDs.push_back(CAS1->getID(*Node)); - auto ID = CAS1->getID(Nodes.back()); - EXPECT_EQ(ID.toString(), IDs.back().toString()); - EXPECT_EQ(*Node, Nodes.back()); - EXPECT_EQ(ID, IDs.back()); + EXPECT_EQ(IDs.back().toString(), IDs.back().toString()); + EXPECT_EQ(Nodes.front(), Nodes.front()); + EXPECT_EQ(Nodes.back(), Nodes.back()); + EXPECT_EQ(IDs.front(), IDs.front()); + EXPECT_EQ(IDs.back(), IDs.back()); if (Nodes.size() <= 1) continue; EXPECT_NE(Nodes.front(), Nodes.back()); @@ -266,7 +267,7 @@ TEST_P(CASTest, NodesBig) { } for (auto ID : CreatedNodes) - ASSERT_THAT_ERROR(CAS->validate(CAS->getID(ID)), Succeeded()); + ASSERT_THAT_ERROR(CAS->validateObject(CAS->getID(ID)), Succeeded()); } #if LLVM_ENABLE_THREADS @@ -332,17 +333,124 @@ static void testBlobsParallel1(ObjectStore &CAS, uint64_t BlobSize) { } TEST_P(CASTest, BlobsParallel) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); uint64_t Size = 1ULL * 1024; ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size)); } #ifdef EXPENSIVE_CHECKS TEST_P(CASTest, BlobsBigParallel) { - std::shared_ptr<ObjectStore> CAS = createObjectStore(); + std::unique_ptr<ObjectStore> CAS = createObjectStore(); // 100k is large enough to be standalone files in our on-disk cas. uint64_t Size = 100ULL * 1024; ASSERT_NO_FATAL_FAILURE(testBlobsParallel1(*CAS, Size)); } #endif // EXPENSIVE_CHECKS + +#ifndef _WIN32 // create_link won't work for directories on Windows +TEST_F(OnDiskCASTest, OnDiskCASBlobsParallelMultiCAS) { + // This test intentionally uses symlinked paths to the same CAS to subvert the + // shared memory mappings that would normally be created within a single + // process. This breaks the lock file guarantees, so we must be careful not + // to create or destroy the CAS objects concurrently, which is when the locks + // are normally important. + unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + ASSERT_EQ(sys::fs::create_directory(Temp.path("real_cas")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas1")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas2")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas3")), + std::error_code()); + + std::unique_ptr<ObjectStore> CAS1, CAS2, CAS3, CAS4; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("real_cas")).moveInto(CAS1), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas1")).moveInto(CAS2), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas2")).moveInto(CAS3), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas3")).moveInto(CAS4), + Succeeded()); + + uint64_t Size = 1ULL * 1024; + ASSERT_NO_FATAL_FAILURE(testBlobsParallel(*CAS1, *CAS2, *CAS3, *CAS4, Size)); +} + +TEST_F(OnDiskCASTest, OnDiskCASBlobsBigParallelMultiCAS) { + // See comment in BlobsParallelMultiCAS. 
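+  // (i.e. OnDiskCASBlobsParallelMultiCAS above: the symlinked paths bypass the
+  // shared memory mappings, so the CAS objects must not be created or
+  // destroyed concurrently.)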
+ unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + ASSERT_EQ(sys::fs::create_directory(Temp.path("real_cas")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas1")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas2")), + std::error_code()); + ASSERT_EQ(sys::fs::create_link("real_cas", Temp.path("sym_cas3")), + std::error_code()); + + std::unique_ptr<ObjectStore> CAS1, CAS2, CAS3, CAS4; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("real_cas")).moveInto(CAS1), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas1")).moveInto(CAS2), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas2")).moveInto(CAS3), + Succeeded()); + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path("sym_cas3")).moveInto(CAS4), + Succeeded()); + + // 100k is large enough to be standalone files in our on-disk cas. + uint64_t Size = 100ULL * 1024; + ASSERT_NO_FATAL_FAILURE(testBlobsParallel(*CAS1, *CAS2, *CAS3, *CAS4, Size)); +} +#endif // _WIN32 #endif // LLVM_ENABLE_THREADS + +TEST_F(OnDiskCASTest, OnDiskCASDiskSize) { + unittest::TempDir Temp("on-disk-cas", /*Unique=*/true); + std::unique_ptr<ObjectStore> CAS; + ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + + uint64_t MaxSize = 100 * 1024 * 1024; + + // Check that we map the files to the correct size. + auto CheckFileSizes = [&](bool Mapped) { + bool FoundIndex = false, FoundData = false; + std::error_code EC; + for (sys::fs::directory_iterator I(Temp.path(), EC), E; I != E && !EC; + I.increment(EC)) { + StringRef Filename = sys::path::filename(I->path()); + if (Filename.starts_with("index.") && !Filename.ends_with(".shared")) { + FoundIndex = true; + ASSERT_TRUE(I->status()); + if (Mapped) + EXPECT_EQ(I->status()->getSize(), MaxSize); + else + EXPECT_LT(I->status()->getSize(), MaxSize); + } + if (Filename.starts_with("data.") && !Filename.ends_with(".shared")) { + FoundData = true; + ASSERT_TRUE(I->status()); + if (Mapped) + EXPECT_EQ(I->status()->getSize(), MaxSize); + else + EXPECT_LT(I->status()->getSize(), MaxSize); + } + } + ASSERT_TRUE(FoundIndex); + ASSERT_TRUE(FoundData); + }; + + // Check that we have the full mapping size when the CAS is open. + CheckFileSizes(/*Mapped=*/true); + CAS.reset(); + // Check that the CAS is shrunk to a smaller size. + CheckFileSizes(/*Mapped=*/false); + + // Repeat the checks when starting from an existing CAS. 
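+  // Reopening should map the files back up to MaxSize and shrink them again
+  // once the CAS is closed.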
+ ASSERT_THAT_ERROR(createOnDiskCAS(Temp.path()).moveInto(CAS), Succeeded()); + CheckFileSizes(/*Mapped=*/true); + CAS.reset(); + CheckFileSizes(/*Mapped=*/false); +} diff --git a/llvm/unittests/CAS/OnDiskCommonUtils.h b/llvm/unittests/CAS/OnDiskCommonUtils.h index 89f93e08366c9..48a1830f9b219 100644 --- a/llvm/unittests/CAS/OnDiskCommonUtils.h +++ b/llvm/unittests/CAS/OnDiskCommonUtils.h @@ -12,6 +12,8 @@ #include "llvm/CAS/BuiltinObjectHasher.h" #include "llvm/CAS/OnDiskGraphDB.h" +#include "llvm/CAS/OnDiskKeyValueDB.h" +#include "llvm/CAS/UnifiedOnDiskCache.h" #include "llvm/Support/BLAKE3.h" #include "llvm/Testing/Support/Error.h" @@ -58,6 +60,25 @@ inline Expected<ObjectID> store(OnDiskGraphDB &DB, StringRef Data, return ID; } +inline Expected<ObjectID> cachePut(OnDiskKeyValueDB &DB, ArrayRef<uint8_t> Key, + ObjectID ID) { + auto Value = UnifiedOnDiskCache::getValueFromObjectID(ID); + auto Result = DB.put(Key, Value); + if (!Result) + return Result.takeError(); + return UnifiedOnDiskCache::getObjectIDFromValue(*Result); +} + +inline Expected<std::optional<ObjectID>> cacheGet(OnDiskKeyValueDB &DB, + ArrayRef<uint8_t> Key) { + auto Result = DB.get(Key); + if (!Result) + return Result.takeError(); + if (!*Result) + return std::nullopt; + return UnifiedOnDiskCache::getObjectIDFromValue(**Result); +} + inline Error printTree(OnDiskGraphDB &DB, ObjectID ID, raw_ostream &OS, unsigned Indent = 0) { std::optional<ondisk::ObjectHandle> Obj; diff --git a/llvm/unittests/CAS/OnDiskGraphDBTest.cpp b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp index 3c2e96318a5ed..e9c73bfb6c8d3 100644 --- a/llvm/unittests/CAS/OnDiskGraphDBTest.cpp +++ b/llvm/unittests/CAS/OnDiskGraphDBTest.cpp @@ -102,7 +102,7 @@ TEST_F(OnDiskCASTest, OnDiskGraphDBFaultInSingleNode) { std::unique_ptr<OnDiskGraphDB> DB; ASSERT_THAT_ERROR( OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType), - std::move(UpstreamDB), + UpstreamDB.get(), OnDiskGraphDB::FaultInPolicy::SingleNode) .moveInto(DB), Succeeded()); @@ -208,7 +208,7 @@ TEST_F(OnDiskCASTest, OnDiskGraphDBFaultInFullTree) { unittest::TempDir Temp("ondiskcas", /*Unique=*/true); std::unique_ptr<OnDiskGraphDB> DB; ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", sizeof(HashType), - std::move(UpstreamDB), + UpstreamDB.get(), OnDiskGraphDB::FaultInPolicy::FullTree) .moveInto(DB), Succeeded()); @@ -264,14 +264,14 @@ TEST_F(OnDiskCASTest, OnDiskGraphDBFaultInPolicyConflict) { unittest::TempDir Temp("ondiskcas", /*Unique=*/true); std::unique_ptr<OnDiskGraphDB> DB; ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", - sizeof(HashType), - std::move(UpstreamDB), Policy1) + sizeof(HashType), UpstreamDB.get(), + Policy1) .moveInto(DB), Succeeded()); DB.reset(); ASSERT_THAT_ERROR(OnDiskGraphDB::open(Temp.path(), "blake3", - sizeof(HashType), - std::move(UpstreamDB), Policy2) + sizeof(HashType), UpstreamDB.get(), + Policy2) .moveInto(DB), Failed()); }; diff --git a/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp b/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp new file mode 100644 index 0000000000000..09aebc2d4bc19 --- /dev/null +++ b/llvm/unittests/CAS/UnifiedOnDiskCacheTest.cpp @@ -0,0 +1,198 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/CAS/UnifiedOnDiskCache.h" +#include "CASTestConfig.h" +#include "OnDiskCommonUtils.h" +#include "llvm/Testing/Support/Error.h" +#include "llvm/Testing/Support/SupportHelpers.h" +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::cas; +using namespace llvm::cas::ondisk; +using namespace llvm::unittest::cas; + +/// Visits all the files of a directory recursively and returns the sum of their +/// sizes. +static Expected<size_t> countFileSizes(StringRef Path) { + size_t TotalSize = 0; + std::error_code EC; + for (sys::fs::directory_iterator DirI(Path, EC), DirE; !EC && DirI != DirE; + DirI.increment(EC)) { + if (DirI->type() == sys::fs::file_type::directory_file) { + Expected<size_t> Subsize = countFileSizes(DirI->path()); + if (!Subsize) + return Subsize.takeError(); + TotalSize += *Subsize; + continue; + } + ErrorOr<sys::fs::basic_file_status> Stat = DirI->status(); + if (!Stat) + return createFileError(DirI->path(), Stat.getError()); + TotalSize += Stat->getSize(); + } + if (EC) + return createFileError(Path, EC); + return TotalSize; +} + +TEST_F(OnDiskCASTest, UnifiedOnDiskCacheTest) { + unittest::TempDir Temp("ondisk-unified", /*Unique=*/true); + std::unique_ptr<UnifiedOnDiskCache> UniDB; + + const uint64_t SizeLimit = 1024ull * 64; + auto reopenDB = [&]() { + UniDB.reset(); + ASSERT_THAT_ERROR(UnifiedOnDiskCache::open(Temp.path(), SizeLimit, "blake3", + sizeof(HashType)) + .moveInto(UniDB), + Succeeded()); + }; + + reopenDB(); + + HashType RootHash; + HashType OtherHash; + HashType Key1Hash; + HashType Key2Hash; + { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional<ObjectID> ID1; + ASSERT_THAT_ERROR(store(DB, "1", {}).moveInto(ID1), Succeeded()); + std::optional<ObjectID> ID2; + ASSERT_THAT_ERROR(store(DB, "2", {}).moveInto(ID2), Succeeded()); + std::optional<ObjectID> IDRoot; + ASSERT_THAT_ERROR(store(DB, "root", {*ID1, *ID2}).moveInto(IDRoot), + Succeeded()); + ArrayRef<uint8_t> Digest = DB.getDigest(*IDRoot); + ASSERT_EQ(Digest.size(), RootHash.size()); + llvm::copy(Digest, RootHash.data()); + + std::optional<ObjectID> IDOther; + ASSERT_THAT_ERROR(store(DB, "other", {}).moveInto(IDOther), Succeeded()); + Digest = DB.getDigest(*IDOther); + ASSERT_EQ(Digest.size(), OtherHash.size()); + llvm::copy(Digest, OtherHash.data()); + + Key1Hash = digest("key1"); + std::optional<ObjectID> Val; + ASSERT_THAT_ERROR( + cachePut(UniDB->getKeyValueDB(), Key1Hash, *IDRoot).moveInto(Val), + Succeeded()); + EXPECT_EQ(IDRoot, Val); + + Key2Hash = digest("key2"); + std::optional<ObjectID> KeyID; + ASSERT_THAT_ERROR(DB.getReference(Key2Hash).moveInto(KeyID), Succeeded()); + ASSERT_THAT_ERROR(cachePut(UniDB->getKeyValueDB(), + UniDB->getGraphDB().getDigest(*KeyID), *ID1) + .moveInto(Val), + Succeeded()); + } + + auto checkTree = [&](const HashType &Digest, StringRef ExpectedTree) { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional<ObjectID> ID; + ASSERT_THAT_ERROR(DB.getReference(Digest).moveInto(ID), Succeeded()); + std::string PrintedTree; + raw_string_ostream OS(PrintedTree); + ASSERT_THAT_ERROR(printTree(DB, *ID, OS), Succeeded()); + EXPECT_EQ(PrintedTree, ExpectedTree); + }; + auto checkRootTree = [&]() { + return checkTree(RootHash, "root\n 1\n 2\n"); + }; + + auto checkKey = [&](const HashType &Key, StringRef ExpectedData) { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional<ObjectID> Val; + 
ASSERT_THAT_ERROR(cacheGet(UniDB->getKeyValueDB(), Key).moveInto(Val), + Succeeded()); + + ASSERT_TRUE(Val.has_value()); + std::optional<ondisk::ObjectHandle> Obj; + ASSERT_THAT_ERROR(DB.load(*Val).moveInto(Obj), Succeeded()); + EXPECT_EQ(toStringRef(DB.getObjectData(*Obj)), ExpectedData); + }; + + checkRootTree(); + checkTree(OtherHash, "other\n"); + checkKey(Key1Hash, "root"); + checkKey(Key2Hash, "1"); + + auto storeBigObject = [&](unsigned Index) { + SmallString<1000> Buf; + Buf.append(970, 'a'); + raw_svector_ostream(Buf) << Index; + std::optional<ObjectID> ID; + ASSERT_THAT_ERROR(store(UniDB->getGraphDB(), Buf, {}).moveInto(ID), + Succeeded()); + }; + + uint64_t PrevStoreSize = UniDB->getStorageSize(); + unsigned Index = 0; + while (!UniDB->hasExceededSizeLimit()) { + storeBigObject(Index++); + } + EXPECT_GT(UniDB->getStorageSize(), PrevStoreSize); + UniDB->setSizeLimit(SizeLimit * 2); + EXPECT_FALSE(UniDB->hasExceededSizeLimit()); + UniDB->setSizeLimit(SizeLimit); + EXPECT_TRUE(UniDB->hasExceededSizeLimit()); + + reopenDB(); + + EXPECT_FALSE(UniDB->hasExceededSizeLimit()); + EXPECT_FALSE(UniDB->needsGarbageCollection()); + + checkRootTree(); + checkKey(Key1Hash, "root"); + + while (!UniDB->hasExceededSizeLimit()) { + storeBigObject(Index++); + } + PrevStoreSize = UniDB->getStorageSize(); + ASSERT_THAT_ERROR(UniDB->close(), Succeeded()); + EXPECT_TRUE(UniDB->needsGarbageCollection()); + + reopenDB(); + EXPECT_TRUE(UniDB->needsGarbageCollection()); + + std::optional<size_t> DirSizeBefore; + ASSERT_THAT_ERROR(countFileSizes(Temp.path()).moveInto(DirSizeBefore), + Succeeded()); + + ASSERT_THAT_ERROR(UnifiedOnDiskCache::collectGarbage(Temp.path()), + Succeeded()); + + std::optional<size_t> DirSizeAfter; + ASSERT_THAT_ERROR(countFileSizes(Temp.path()).moveInto(DirSizeAfter), + Succeeded()); + EXPECT_LT(*DirSizeAfter, *DirSizeBefore); + + reopenDB(); + EXPECT_FALSE(UniDB->needsGarbageCollection()); + + checkRootTree(); + checkKey(Key1Hash, "root"); + + EXPECT_LT(UniDB->getStorageSize(), PrevStoreSize); + + // 'Other' tree and 'Key2' got garbage-collected. + { + OnDiskGraphDB &DB = UniDB->getGraphDB(); + std::optional<ObjectID> ID; + ASSERT_THAT_ERROR(DB.getReference(OtherHash).moveInto(ID), Succeeded()); + EXPECT_FALSE(DB.containsObject(*ID)); + std::optional<ObjectID> Val; + ASSERT_THAT_ERROR(cacheGet(UniDB->getKeyValueDB(), Key2Hash).moveInto(Val), + Succeeded()); + EXPECT_FALSE(Val.has_value()); + } +} diff --git a/llvm/unittests/Support/Casting.cpp b/llvm/unittests/Support/Casting.cpp index 790675083614b..0df8b9fcab452 100644 --- a/llvm/unittests/Support/Casting.cpp +++ b/llvm/unittests/Support/Casting.cpp @@ -561,6 +561,47 @@ TEST(CastingTest, assertion_check_unique_ptr) { << "Invalid cast of const ref did not cause an abort()"; } +TEST(Casting, StaticCastPredicate) { + uint32_t Value = 1; + + static_assert( + std::is_same_v<decltype(StaticCastTo<uint64_t>(Value)), uint64_t>); +} + +TEST(Casting, LLVMRTTIPredicates) { + struct Base { + enum Kind { BK_Base, BK_Derived }; + const Kind K; + Base(Kind K = BK_Base) : K(K) {} + Kind getKind() const { return K; } + virtual ~Base() = default; + }; + + struct Derived : Base { + Derived() : Base(BK_Derived) {} + static bool classof(const Base *B) { return B->getKind() == BK_Derived; } + bool Field = false; + }; + + Base B; + Derived D; + Base *BD = &D; + Base *Null = nullptr; + + // Pointers. 
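+  // DynCastTo yields nullptr on a failed cast, while the *IfPresentTo variants
+  // additionally accept a null input; both behaviors are exercised below.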
+ EXPECT_EQ(DynCastTo<Derived>(BD), &D); + EXPECT_EQ(CastTo<Derived>(BD), &D); + EXPECT_EQ(DynCastTo<Derived>(&B), nullptr); + EXPECT_EQ(CastIfPresentTo<Derived>(BD), &D); + EXPECT_EQ(CastIfPresentTo<Derived>(Null), nullptr); + EXPECT_EQ(DynCastIfPresentTo<Derived>(BD), &D); + EXPECT_EQ(DynCastIfPresentTo<Derived>(Null), nullptr); + + Base &R = D; + CastTo<Derived>(R).Field = true; + EXPECT_TRUE(D.Field); +} + } // end namespace assertion_checks #endif } // end namespace diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp index 4b53cc3d6e516..4908eda16e002 100644 --- a/llvm/unittests/Transforms/Utils/LocalTest.cpp +++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp @@ -1153,7 +1153,7 @@ TEST(Local, ExpressionForConstant) { IntegerType *Int1Ty = Type::getInt1Ty(Context); Expr = createExpression(ConstantInt::getTrue(Context), Int1Ty); EXPECT_NE(Expr, nullptr); - EXPECT_EQ(Expr->getElement(1), 18446744073709551615U); + EXPECT_EQ(Expr->getElement(1), 1U); Expr = createExpression(ConstantInt::getFalse(Context), Int1Ty); EXPECT_NE(Expr, nullptr); diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp index 46802826fe090..169114ed6c310 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp @@ -326,22 +326,18 @@ TEST_F(VPVerifierTest, NonHeaderPHIInHeader) { class VPIRVerifierTest : public VPlanTestIRBase {}; -TEST_F(VPIRVerifierTest, testVerifyIRPhi) { +TEST_F(VPIRVerifierTest, testVerifyIRPhiInScalarHeaderVPIRBB) { const char *ModuleString = "define void @f(ptr %A, i64 %N) {\n" "entry:\n" " br label %loop\n" "loop:\n" " %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]\n" - " %arr.idx = getelementptr inbounds i32, ptr %A, i64 %iv\n" - " %l1 = load i32, ptr %arr.idx, align 4\n" - " %res = add i32 %l1, 10\n" - " store i32 %res, ptr %arr.idx, align 4\n" " %iv.next = add i64 %iv, 1\n" " %exitcond = icmp ne i64 %iv.next, %N\n" " br i1 %exitcond, label %loop, label %for.end\n" "for.end:\n" - " %p = phi i32 [ %l1, %loop ]\n" + " %p = phi i64 [ %iv, %loop ]\n" " ret void\n" "}\n"; @@ -351,7 +347,48 @@ TEST_F(VPIRVerifierTest, testVerifyIRPhi) { BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); auto Plan = buildVPlan(LoopHeader); - Plan->getExitBlocks()[0]->front().addOperand(Plan->getConstantInt(32, 0)); +#if GTEST_HAS_STREAM_REDIRECTION + ::testing::internal::CaptureStderr(); +#endif + EXPECT_FALSE(verifyVPlanIsValid(*Plan)); +#if GTEST_HAS_STREAM_REDIRECTION + EXPECT_STREQ( + "Phi-like recipe with different number of operands and predecessors.\n", + ::testing::internal::GetCapturedStderr().c_str()); +#endif +} + +TEST_F(VPIRVerifierTest, testVerifyIRPhiInExitVPIRBB) { + const char *ModuleString = + "define void @f(ptr %A, i64 %N) {\n" + "entry:\n" + " br label %loop\n" + "loop:\n" + " %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]\n" + " %iv.next = add i64 %iv, 1\n" + " %exitcond = icmp ne i64 %iv.next, %N\n" + " br i1 %exitcond, label %loop, label %for.end\n" + "for.end:\n" + " %p = phi i64 [ %iv, %loop ]\n" + " ret void\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("f"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildVPlan(LoopHeader); + + // Create a definition in the vector loop header that will be used by the phi. 
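+  // (The extract recipe itself is placed in the middle block; only its operand
+  // comes from the vector loop header.)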
+ auto *HeaderBlock = + cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getEntry()); + VPInstruction *DefI = + new VPInstruction(VPInstruction::ExtractLastElement, + {HeaderBlock->front().getVPSingleValue()}); + DefI->insertBefore(Plan->getMiddleBlock()->getTerminator()); + Plan->getExitBlocks()[0]->front().addOperand(DefI); + VPValue *Zero = Plan->getConstantInt(32, 0); + Plan->getScalarHeader()->front().addOperand(Zero); #if GTEST_HAS_STREAM_REDIRECTION ::testing::internal::CaptureStderr(); diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp index cd866469792a2..ff894853b9771 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.cpp @@ -407,6 +407,8 @@ void CodeGenIntrinsic::setProperty(const Record *R) { hasSideEffects = true; else if (R->getName() == "IntrStrictFP") isStrictFP = true; + else if (R->getName() == "IntrNoCreateUndefOrPoison") + isNoCreateUndefOrPoison = true; else if (R->isSubClassOf("NoCapture")) { unsigned ArgNo = R->getValueAsInt("ArgNo"); addArgAttribute(ArgNo, NoCapture); diff --git a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h index 2e86149514f46..15e803c4feba1 100644 --- a/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/Basic/CodeGenIntrinsics.h @@ -114,6 +114,9 @@ struct CodeGenIntrinsic { // True if the intrinsic is marked as strictfp. bool isStrictFP = false; + // True if the intrinsic is marked as IntrNoCreateUndefOrPoison. + bool isNoCreateUndefOrPoison = false; + enum ArgAttrKind { NoCapture, NoAlias, diff --git a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp index 3c6ff1132230b..d33bf45595e2e 100644 --- a/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp +++ b/llvm/utils/TableGen/Basic/DirectiveEmitter.cpp @@ -81,6 +81,7 @@ static void generateEnumExports(ArrayRef<const Record *> Records, std::string N = getIdentifierName(R, Prefix); OS << "constexpr auto " << N << " = " << Enum << "::" << N << ";\n"; } + OS << "\n"; } // Generate enum class. Entries are emitted in the order in which they appear @@ -88,7 +89,6 @@ static void generateEnumExports(ArrayRef<const Record *> Records, static void generateEnumClass(ArrayRef<const Record *> Records, raw_ostream &OS, StringRef Enum, StringRef Prefix, bool ExportEnums) { - OS << "\n"; OS << "enum class " << Enum << " {\n"; if (!Records.empty()) { std::string N; @@ -104,17 +104,15 @@ static void generateEnumClass(ArrayRef<const Record *> Records, raw_ostream &OS, OS << "};\n"; OS << "\n"; OS << "static constexpr std::size_t " << Enum - << "_enumSize = " << Records.size() << ";\n"; + << "_enumSize = " << Records.size() << ";\n\n"; // Make the enum values available in the defined namespace. This allows us to // write something like Enum_X if we have a `using namespace <CppNamespace>`. // At the same time we do not loose the strong type guarantees of the enum // class, that is we cannot pass an unsigned as Directive without an explicit // cast. - if (ExportEnums) { - OS << "\n"; + if (ExportEnums) generateEnumExports(Records, OS, Enum, Prefix); - } } // Generate enum class with values corresponding to different bit positions. @@ -127,7 +125,6 @@ static void generateEnumBitmask(ArrayRef<const Record *> Records, StringRef Type = Records.size() <= 32 ? "uint32_t" : "uint64_t"; StringRef TypeSuffix = Records.size() <= 32 ? 
"U" : "ULL"; - OS << "\n"; OS << "enum class " << Enum << " : " << Type << " {\n"; std::string LastName; for (auto [I, R] : llvm::enumerate(Records)) { @@ -138,17 +135,15 @@ static void generateEnumBitmask(ArrayRef<const Record *> Records, OS << "};\n"; OS << "\n"; OS << "static constexpr std::size_t " << Enum - << "_enumSize = " << Records.size() << ";\n"; + << "_enumSize = " << Records.size() << ";\n\n"; // Make the enum values available in the defined namespace. This allows us to // write something like Enum_X if we have a `using namespace <CppNamespace>`. // At the same time we do not loose the strong type guarantees of the enum // class, that is we cannot pass an unsigned as Directive without an explicit // cast. - if (ExportEnums) { - OS << "\n"; + if (ExportEnums) generateEnumExports(Records, OS, Enum, Prefix); - } } // Generate enums for values that clauses can take. @@ -170,7 +165,6 @@ static void generateClauseEnumVal(ArrayRef<const Record *> Records, return; } - OS << "\n"; OS << "enum class " << Enum << " {\n"; for (const EnumVal Val : ClauseVals) OS << " " << Val.getRecordName() << "=" << Val.getValue() << ",\n"; @@ -182,6 +176,7 @@ static void generateClauseEnumVal(ArrayRef<const Record *> Records, OS << "constexpr auto " << CV->getName() << " = " << Enum << "::" << CV->getName() << ";\n"; } + OS << "\n"; EnumHelperFuncs += (Twine("LLVM_ABI ") + Twine(Enum) + Twine(" get") + Twine(Enum) + Twine("(StringRef Str);\n")) .str(); @@ -284,7 +279,7 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { NamespaceEmitter DirLangNS(OS, DirLang.getCppNamespace()); if (DirLang.hasEnableBitmaskEnumInNamespace()) - OS << "\nLLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();\n"; + OS << "LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();\n\n"; // Emit Directive associations std::vector<const Record *> Associations; @@ -315,7 +310,6 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { generateClauseEnumVal(DirLang.getClauses(), OS, DirLang, EnumHelperFuncs); // Generic function signatures - OS << "\n"; OS << "// Enumeration helper functions\n"; OS << "LLVM_ABI std::pair<Directive, directive::VersionRange> get" << Lang @@ -353,10 +347,7 @@ static void emitDirectivesDecl(const RecordKeeper &Records, raw_ostream &OS) { OS << "LLVM_ABI Association getDirectiveAssociation(Directive D);\n"; OS << "LLVM_ABI Category getDirectiveCategory(Directive D);\n"; OS << "LLVM_ABI SourceLanguage getDirectiveLanguages(Directive D);\n"; - if (EnumHelperFuncs.length() > 0) { - OS << EnumHelperFuncs; - OS << "\n"; - } + OS << EnumHelperFuncs; DirLangNS.close(); diff --git a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp index 75dffb18fca5a..452d2b08f25c3 100644 --- a/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/Basic/IntrinsicEmitter.cpp @@ -421,7 +421,8 @@ static bool compareFnAttributes(const CodeGenIntrinsic *L, return std::tie(I->canThrow, I->isNoDuplicate, I->isNoMerge, I->isNoReturn, I->isNoCallback, I->isNoSync, I->isNoFree, I->isWillReturn, I->isCold, I->isConvergent, I->isSpeculatable, - I->hasSideEffects, I->isStrictFP); + I->hasSideEffects, I->isStrictFP, + I->isNoCreateUndefOrPoison); }; auto TieL = TieBoolAttributes(L); @@ -446,7 +447,8 @@ static bool hasFnAttributes(const CodeGenIntrinsic &Int) { return !Int.canThrow || Int.isNoReturn || Int.isNoCallback || Int.isNoSync || Int.isNoFree || Int.isWillReturn || Int.isCold || Int.isNoDuplicate || Int.isNoMerge || Int.isConvergent || 
Int.isSpeculatable || - Int.isStrictFP || getEffectiveME(Int) != MemoryEffects::unknown(); + Int.isStrictFP || Int.isNoCreateUndefOrPoison || + getEffectiveME(Int) != MemoryEffects::unknown(); } namespace { @@ -605,6 +607,8 @@ static AttributeSet getIntrinsicFnAttributeSet(LLVMContext &C, unsigned ID) { addAttribute("Speculatable"); if (Int.isStrictFP) addAttribute("StrictFP"); + if (Int.isNoCreateUndefOrPoison) + addAttribute("NoCreateUndefOrPoison"); const MemoryEffects ME = getEffectiveME(Int); if (ME != MemoryEffects::unknown()) { diff --git a/llvm/utils/UpdateTestChecks/asm.py b/llvm/utils/UpdateTestChecks/asm.py index 469e27facedb0..82377862885c8 100644 --- a/llvm/utils/UpdateTestChecks/asm.py +++ b/llvm/utils/UpdateTestChecks/asm.py @@ -51,9 +51,9 @@ class string: ) ASM_FUNCTION_AMDGPU_RE = re.compile( - r"\.type\s+_?(?P<func>[^,\n]+),@function\n" + r"\.type\s+(_|\.L)?(?P<func>[^,\n]+),@function\n" r"(^\s*\.amdgpu_hsa_kernel (?P=func)\n)?" - r'^_?(?P=func):(?:[ \t]*;+[ \t]*@"?(?P=func)"?)?\n' + r'^(_|\.L)?(?P=func):(?:[ \t]*;+[ \t]*@"?(?P=func)"?)?\n' r"(?P<body>.*?)\n" # (body of the function) # This list is incomplete r"^\s*(\.Lfunc_end[0-9]+:\n|\.section)", @@ -576,6 +576,7 @@ def get_run_handler(triple): "armv7-apple-ios": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_IOS_RE), "armv7-apple-darwin": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_DARWIN_RE), "armv7k-apple-watchos": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_DARWIN_RE), + "thumbv7k-apple-watchos": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_DARWIN_RE), "thumb": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_RE), "thumb-macho": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_MACHO_RE), "thumbv5-macho": (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_MACHO_RE), diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index 2dad16a8eebb7..baa0377cd8b81 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -605,6 +605,7 @@ def invoke_tool(exe, cmd_args, ir, preprocess_cmd=None, verbose=False): TRIPLE_ARG_RE = re.compile(r"-m?triple[= ]([^ ]+)") MARCH_ARG_RE = re.compile(r"-march[= ]([^ ]+)") DEBUG_ONLY_ARG_RE = re.compile(r"-debug-only[= ]([^ ]+)") +STOP_PASS_RE = re.compile(r"-stop-(before|after)=(\w+)") IS_DEBUG_RECORD_RE = re.compile(r"^(\s+)#dbg_") IS_SWITCH_CASE_RE = re.compile(r"^\s+i\d+ \d+, label %\S+") diff --git a/llvm/utils/UpdateTestChecks/mir.py b/llvm/utils/UpdateTestChecks/mir.py index 24bb8b341d335..01ee0e19f7cb9 100644 --- a/llvm/utils/UpdateTestChecks/mir.py +++ b/llvm/utils/UpdateTestChecks/mir.py @@ -163,13 +163,15 @@ def add_mir_checks_for_function( print_fixed_stack, first_check_is_next, at_the_function_name, + check_indent=None, ): printed_prefixes = set() for run in run_list: for prefix in run[0]: if prefix in printed_prefixes: break - if not func_dict[prefix][func_name]: + # func_info can be empty if there was a prefix conflict. + if not func_dict[prefix].get(func_name): continue if printed_prefixes: # Add some space between different check prefixes. 
@@ -185,6 +187,7 @@ def add_mir_checks_for_function( func_dict[prefix][func_name], print_fixed_stack, first_check_is_next, + check_indent, ) break else: @@ -204,6 +207,7 @@ def add_mir_check_lines( func_info, print_fixed_stack, first_check_is_next, + check_indent=None, ): func_body = str(func_info).splitlines() if single_bb: @@ -220,7 +224,10 @@ def add_mir_check_lines( first_line = func_body[0] indent = len(first_line) - len(first_line.lstrip(" ")) # A check comment, indented the appropriate amount - check = "{:>{}}; {}".format("", indent, prefix) + if check_indent is not None: + check = "{}; {}".format(check_indent, prefix) + else: + check = "{:>{}}; {}".format("", indent, prefix) output_lines.append("{}-LABEL: name: {}".format(check, func_name)) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn index f280f695cd3ab..2f84999621e1b 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn @@ -32,6 +32,7 @@ static_library("bugprone") { "CopyConstructorInitCheck.cpp", "CrtpConstructorAccessibilityCheck.cpp", "DanglingHandleCheck.cpp", + "DefaultOperatorNewOnOveralignedTypeCheck.cpp", "DerivedMethodShadowingBaseMethodCheck.cpp", "DynamicStaticInitializersCheck.cpp", "EasilySwappableParametersCheck.cpp", diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn index 3ad0a83a8fb23..ec642b6afad66 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/cert/BUILD.gn @@ -16,7 +16,6 @@ static_library("cert") { ] sources = [ "CERTTidyModule.cpp", - "DefaultOperatorNewAlignmentCheck.cpp", "DontModifyStdNamespaceCheck.cpp", "FloatLoopCounter.cpp", "LimitedRandomnessCheck.cpp", diff --git a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn index a42f781271a21..b6c2f465a7292 100644 --- a/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/tools/lldb-dap/BUILD.gn @@ -21,6 +21,7 @@ static_library("lib") { sources = [ "Breakpoint.cpp", "BreakpointBase.cpp", + "ClientLauncher.cpp", "CommandPlugins.cpp", "DAP.cpp", "DAPError.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn index 5590b27ac3784..1e0e918d3370c 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CAS/BUILD.gn @@ -4,14 +4,17 @@ static_library("CAS") { "ActionCache.cpp", "ActionCaches.cpp", "BuiltinCAS.cpp", + "BuiltinUnifiedCASDatabases.cpp", "DatabaseFile.cpp", "InMemoryCAS.cpp", "MappedFileRegionArena.cpp", "ObjectStore.cpp", + "OnDiskCAS.cpp", "OnDiskCommon.cpp", "OnDiskDataAllocator.cpp", "OnDiskGraphDB.cpp", "OnDiskKeyValueDB.cpp", "OnDiskTrieRawHashMap.cpp", + "UnifiedOnDiskCache.cpp", ] } diff --git a/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn index 2d9eb6814c376..b10e0e6706cc3 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/CAS/BUILD.gn @@ -9,8 +9,10 @@ unittest("CASTests") { ] sources = [ "ActionCacheTest.cpp", + "BuiltinUnifiedCASDatabasesTest.cpp", "CASTestConfig.cpp", "ObjectStoreTest.cpp", + "UnifiedOnDiskCacheTest.cpp", ] if 
(llvm_enable_ondisk_cas) { diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index 3176b1a257434..64148c6098327 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -462,16 +462,15 @@ def executeBuiltinMkdir(cmd, cmd_shenv): stderr = StringIO() exitCode = 0 for dir in args: - cwd = cmd_shenv.cwd - dir = to_unicode(dir) if kIsWindows else to_bytes(dir) - cwd = to_unicode(cwd) if kIsWindows else to_bytes(cwd) - if not os.path.isabs(dir): - dir = lit.util.abs_path_preserve_drive(os.path.join(cwd, dir)) + dir = pathlib.Path(dir) + cwd = pathlib.Path(to_unicode(cmd_shenv.cwd)) + if not dir.is_absolute(): + dir = lit.util.abs_path_preserve_drive(cwd / dir) if parent: - lit.util.mkdir_p(dir) + dir.mkdir(parents=True, exist_ok=True) else: try: - lit.util.mkdir(dir) + dir.mkdir(exist_ok=True) except OSError as err: stderr.write("Error: 'mkdir' command failed, %s\n" % str(err)) exitCode = 1 @@ -2411,7 +2410,7 @@ def runOnce( return out, err, exitCode, timeoutInfo, status # Create the output directory if it does not already exist. - lit.util.mkdir_p(os.path.dirname(tmpBase)) + pathlib.Path(tmpBase).parent.mkdir(parents=True, exist_ok=True) # Re-run failed tests up to test.allowed_retries times. execdir = os.path.dirname(test.getExecPath()) diff --git a/llvm/utils/lit/lit/util.py b/llvm/utils/lit/lit/util.py index ce4c3c2df3436..e4e031b3e0898 100644 --- a/llvm/utils/lit/lit/util.py +++ b/llvm/utils/lit/lit/util.py @@ -5,6 +5,7 @@ import math import numbers import os +import pathlib import platform import re import signal @@ -131,48 +132,17 @@ def abs_path_preserve_drive(path): # Since Python 3.8, os.path.realpath resolves sustitute drives, # so we should not use it. In Python 3.7, os.path.realpath # was implemented as os.path.abspath. + if isinstance(path, pathlib.Path): + return path.absolute() return os.path.abspath(path) else: # On UNIX, the current directory always has symbolic links resolved, # so any program accepting relative paths cannot preserve symbolic # links in paths and we should always use os.path.realpath. + if isinstance(path, pathlib.Path): + return path.resolve() return os.path.realpath(path) -def mkdir(path): - try: - if platform.system() == "Windows": - from ctypes import windll - from ctypes import GetLastError, WinError - - path = os.path.abspath(path) - # Make sure that the path uses backslashes here, in case - # python would have happened to use forward slashes, as the - # NT path format only supports backslashes. - path = path.replace("/", "\\") - NTPath = to_unicode(r"\\?\%s" % path) - if not windll.kernel32.CreateDirectoryW(NTPath, None): - raise WinError(GetLastError()) - else: - os.mkdir(path) - except OSError: - e = sys.exc_info()[1] - # ignore EEXIST, which may occur during a race condition - if e.errno != errno.EEXIST: - raise - - -def mkdir_p(path): - """mkdir_p(path) - Make the "path" directory, if it does not exist; this - will also make directories for any missing parent directories.""" - if not path or os.path.exists(path): - return - - parent = os.path.dirname(path) - if parent != path: - mkdir_p(parent) - - mkdir(path) - def listdir_files(dirname, suffixes=None, exclude_filenames=None, prefixes=None): """Yields files in a directory. 
diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir1.input/empty b/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir1.input/empty new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir2.input/empty b/llvm/utils/lit/tests/Inputs/shtest-glob/example_dir2.input/empty new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt index d1329f5dbfaae..71972411bc4cf 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt +++ b/llvm/utils/lit/tests/Inputs/shtest-glob/glob-mkdir.txt @@ -1,2 +1,7 @@ ## Tests glob pattern handling in the mkdir command. + +## This mkdir should fail because the `example_file*.input`s are regular files. # RUN: not mkdir %S/example_file*.input + +## This mkdir should succeed (so RUN should fail) because the `example_dir*.input`s that already exist are directories. +# RUN: not mkdir %S/example_dir*.input diff --git a/llvm/utils/lit/tests/shtest-glob.py b/llvm/utils/lit/tests/shtest-glob.py index ae90f31907d49..aa4705b634a7d 100644 --- a/llvm/utils/lit/tests/shtest-glob.py +++ b/llvm/utils/lit/tests/shtest-glob.py @@ -1,12 +1,13 @@ ## Tests glob pattern handling in echo command. # RUN: not %{lit} -a -v %{inputs}/shtest-glob \ -# RUN: | FileCheck -dump-input=fail -match-full-lines %s -# +# RUN: | FileCheck -dump-input=fail -match-full-lines --implicit-check-not=Error: %s # END. # CHECK: UNRESOLVED: shtest-glob :: glob-echo.txt ({{[^)]*}}) # CHECK: TypeError: string argument expected, got 'GlobItem' -# CHECK: FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}}) -# CHECK: # error: command failed with exit status: 1 +# CHECK: FAIL: shtest-glob :: glob-mkdir.txt ({{[^)]*}}) +# CHECK: # | Error: 'mkdir' command failed, {{.*}}example_file1.input' +# CHECK-NEXT: # | Error: 'mkdir' command failed, {{.*}}example_file2.input' +# CHECK: # error: command failed with exit status: 1 diff --git a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py index 286fd3d7e173e..d81cde0159792 100644 --- a/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py +++ b/llvm/utils/lit/tests/shtest-ulimit-nondarwin.py @@ -2,7 +2,7 @@ # ulimit does not work on non-POSIX platforms. # These tests are specific to options that Darwin does not support. 
-# UNSUPPORTED: system-windows, system-darwin, system-aix, system-solaris +# UNSUPPORTED: system-windows, system-cygwin, system-darwin, system-aix, system-solaris # RUN: not %{lit} -a -v %{inputs}/shtest-ulimit-nondarwin | FileCheck %s diff --git a/llvm/utils/profcheck-xfail.txt b/llvm/utils/profcheck-xfail.txt index 380b162d8c58c..53757c73fb8a6 100644 --- a/llvm/utils/profcheck-xfail.txt +++ b/llvm/utils/profcheck-xfail.txt @@ -75,6 +75,7 @@ CodeGen/Hexagon/masked_gather.ll CodeGen/NVPTX/lower-ctor-dtor.ll CodeGen/RISCV/zmmul.ll CodeGen/WebAssembly/memory-interleave.ll +CodeGen/X86/AMX/amx-low-intrinsics.ll CodeGen/X86/masked_gather_scatter.ll CodeGen/X86/nocfivalue.ll DebugInfo/AArch64/ir-outliner.ll @@ -530,32 +531,6 @@ Instrumentation/TypeSanitizer/swifterror.ll LTO/X86/diagnostic-handler-remarks-with-hotness.ll Other/optimization-remarks-auto.ll Other/X86/debugcounter-partiallyinlinelibcalls.ll -tools/llvm-objcopy/ELF/auto-remove-add-symtab-shndx.test -tools/UpdateTestChecks/update_analyze_test_checks/loop-access-analysis.test -tools/UpdateTestChecks/update_analyze_test_checks/loop-distribute.test -tools/UpdateTestChecks/update_test_checks/argument_name_reuse.test -tools/UpdateTestChecks/update_test_checks/basic.test -tools/UpdateTestChecks/update_test_checks/check_attrs.test -tools/UpdateTestChecks/update_test_checks/difile_absolute_filenames.test -tools/UpdateTestChecks/update_test_checks/filter_out_after.test -tools/UpdateTestChecks/update_test_checks/generated_funcs_prefix_reuse.test -tools/UpdateTestChecks/update_test_checks/generated_funcs.test -tools/UpdateTestChecks/update_test_checks/global_preserve_name.test -tools/UpdateTestChecks/update_test_checks/if_target.test -tools/UpdateTestChecks/update_test_checks/named_function_arguments_split.test -tools/UpdateTestChecks/update_test_checks/on_the_fly_arg_change.test -tools/UpdateTestChecks/update_test_checks/phi-labels.test -tools/UpdateTestChecks/update_test_checks/pre-process.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values2.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values3.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values4.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values5.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values6.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values_funcs.test -tools/UpdateTestChecks/update_test_checks/stable_ir_values.test -tools/UpdateTestChecks/update_test_checks/switch_case.test -tools/UpdateTestChecks/update_test_checks/tbaa-semantics-checks.test -tools/UpdateTestChecks/update_test_checks/various_ir_values_dbgrecords.test Transforms/AtomicExpand/AArch64/atomicrmw-fp.ll Transforms/AtomicExpand/AArch64/expand-atomicrmw-xchg-fp.ll Transforms/AtomicExpand/AArch64/pcsections.ll @@ -917,7 +892,6 @@ Transforms/InstCombine/select_frexp.ll Transforms/InstCombine/select.ll Transforms/InstCombine/select-min-max.ll Transforms/InstCombine/select-of-symmetric-selects.ll -Transforms/InstCombine/select-safe-transforms.ll Transforms/InstCombine/select-select.ll Transforms/InstCombine/select-with-extreme-eq-cond.ll Transforms/InstCombine/shift.ll @@ -1324,6 +1298,7 @@ Transforms/SimpleLoopUnswitch/trivial-unswitch.ll Transforms/SimpleLoopUnswitch/trivial-unswitch-logical-and-or.ll Transforms/StackProtector/cross-dso-cfi-stack-chk-fail.ll Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll +Transforms/StructurizeCFG/callbr.ll Transforms/StructurizeCFG/hoist-zerocost.ll Transforms/StructurizeCFG/loop-break-phi.ll 
Transforms/StructurizeCFG/nested-loop-order.ll diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat index 001339f2a8f05..3aeedaa3288a2 100644 --- a/llvm/utils/release/build_llvm_release.bat +++ b/llvm/utils/release/build_llvm_release.bat @@ -1,6 +1,9 @@ @echo off -setlocal enabledelayedexpansion +REM Filter out tests that are known to fail. +set "LIT_FILTER_OUT=gh110231.cpp|crt_initializers.cpp|init-order-atexit.cpp|use_after_return_linkage.cpp|initialization-bug.cpp|initialization-bug-no-global.cpp|trace-malloc-unbalanced.test|trace-malloc-2.test|TraceMallocTest" + +setlocal enabledelayedexpansion goto begin :usage @@ -24,6 +27,7 @@ echo. echo Example: build_llvm_release.bat --version 15.0.0 --x86 --x64 exit /b 1 + :begin ::============================================================================== @@ -163,7 +167,8 @@ set common_cmake_flags=^ -DCMAKE_CXX_FLAGS="%common_compiler_flags%" ^ -DLLVM_ENABLE_RPMALLOC=ON ^ -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld" ^ - -DLLVM_ENABLE_RUNTIMES="compiler-rt;openmp" + -DLLVM_ENABLE_RUNTIMES="compiler-rt;openmp" ^ + -DCOMPILER_RT_BUILD_ORC=OFF if "%force-msvc%" == "" ( where /q clang-cl @@ -224,7 +229,7 @@ ninja || ninja || ninja || exit /b 1 REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 +REM ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1 REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 cd.. @@ -249,7 +254,7 @@ ninja || ninja || ninja || exit /b 1 REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 +REM ninja check-runtimes || ninja check-runtimes || ninja check-runtimes || exit /b 1 REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 ninja package || exit /b 1 cd .. diff --git a/llvm/utils/update_llc_test_checks.py b/llvm/utils/update_llc_test_checks.py index 8c57e75f34f75..98864be62875b 100755 --- a/llvm/utils/update_llc_test_checks.py +++ b/llvm/utils/update_llc_test_checks.py @@ -15,7 +15,7 @@ import os # Used to advertise this file's name ("autogenerated_note"). import sys -from UpdateTestChecks import common +from UpdateTestChecks import common, mir # llc is the only llc-like in the LLVM tree but downstream forks can add # additional ones here if they have them. @@ -33,6 +33,7 @@ def update_test(ti: common.TestInfo): break run_list = [] + mir_run_list = [] for l in ti.run_lines: if "|" not in l: common.warn("Skipping unparsable RUN line: " + l) @@ -57,9 +58,14 @@ def update_test(ti: common.TestInfo): if m: march_in_cmd = m.groups()[0] + target_list = run_list m = common.DEBUG_ONLY_ARG_RE.search(llc_cmd) if m and m.groups()[0] == "isel": from UpdateTestChecks import isel as output_type + elif not m and common.STOP_PASS_RE.search(llc_cmd): + # MIR output mode. If -debug-only is present assume + # the debug output is the main point of interest. 
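+            # A RUN line such as
+            #   llc -mtriple=<triple> -stop-after=<pass> -o - %s
+            # (with no -debug-only option) is treated as MIR output here.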
+ target_list = mir_run_list else: from UpdateTestChecks import asm as output_type @@ -84,7 +90,7 @@ def update_test(ti: common.TestInfo): # FIXME: We should use multiple check prefixes to common check lines. For # now, we just ignore all but the last. - run_list.append( + target_list.append( ( check_prefixes, llc_tool, @@ -119,14 +125,20 @@ def update_test(ti: common.TestInfo): ginfo=ginfo, ) - for ( - prefixes, - llc_tool, - llc_args, - preprocess_cmd, - triple_in_cmd, - march_in_cmd, - ) in run_list: + # Dictionary to store MIR function bodies separately + mir_func_dict = {} + for run_tuple, is_mir in [(run, False) for run in run_list] + [ + (run, True) for run in mir_run_list + ]: + ( + prefixes, + llc_tool, + llc_args, + preprocess_cmd, + triple_in_cmd, + march_in_cmd, + ) = run_tuple + common.debug("Extracted LLC cmd:", llc_tool, llc_args) common.debug("Extracted FileCheck prefixes:", str(prefixes)) @@ -141,22 +153,54 @@ def update_test(ti: common.TestInfo): if not triple: triple = common.get_triple_from_march(march_in_cmd) - scrubber, function_re = output_type.get_run_handler(triple) - if 0 == builder.process_run_line( - function_re, scrubber, raw_tool_output, prefixes - ): - common.warn( - "Couldn't match any function. Possibly the wrong target triple has been provided" + if is_mir: + # MIR output mode + common.debug("Detected MIR output mode for prefixes:", str(prefixes)) + for prefix in prefixes: + if prefix not in mir_func_dict: + mir_func_dict[prefix] = {} + + mir.build_function_info_dictionary( + ti.path, + raw_tool_output, + triple, + prefixes, + mir_func_dict, + ti.args.verbose, ) - builder.processed_prefixes(prefixes) + else: + # ASM output mode + scrubber, function_re = output_type.get_run_handler(triple) + if 0 == builder.process_run_line( + function_re, scrubber, raw_tool_output, prefixes + ): + common.warn( + "Couldn't match any function. Possibly the wrong target triple has been provided" + ) + builder.processed_prefixes(prefixes) func_dict = builder.finish_and_get_func_dict() + + # Check for conflicts: same prefix used for both ASM and MIR + conflicting_prefixes = set(func_dict.keys()) & set(mir_func_dict.keys()) + if conflicting_prefixes: + common.warn( + "The following prefixes are used for both ASM and MIR output, which will cause FileCheck failures: {}".format( + ", ".join(sorted(conflicting_prefixes)) + ), + test_file=ti.path, + ) + for prefix in conflicting_prefixes: + mir_func_dict[prefix] = {} + func_dict[prefix] = {} + global_vars_seen_dict = {} is_in_function = False is_in_function_start = False func_name = None prefix_set = set([prefix for p in run_list for prefix in p[0]]) + prefix_set.update([prefix for p in mir_run_list for prefix in p[0]]) common.debug("Rewriting FileCheck prefixes:", str(prefix_set)) output_lines = [] @@ -221,6 +265,22 @@ def update_test(ti: common.TestInfo): is_filtered=builder.is_filtered(), ) ) + + # Also add MIR checks if we have them for this function + if mir_run_list and func_name: + mir.add_mir_checks_for_function( + ti.path, + output_lines, + mir_run_list, + mir_func_dict, + func_name, + single_bb=False, # Don't skip basic block labels. + print_fixed_stack=False, # Don't print fixed stack (ASM tests don't need it). + first_check_is_next=False, # First check is LABEL, not NEXT. + at_the_function_name=False, # Use "name:" not "@name". + check_indent="", # No indentation for IR files (not MIR files). 
+ ) + is_in_function_start = False if is_in_function: diff --git a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h index 3c87c453a4cf0..5b7b45fdd1d58 100644 --- a/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h +++ b/mlir/include/mlir/Analysis/DataFlow/DenseAnalysis.h @@ -127,6 +127,18 @@ class AbstractDenseForwardDataFlowAnalysis : public DataFlowAnalysis { /// them into the same equivalent class. virtual void buildOperationEquivalentLatticeAnchor(Operation *op) {} + /// Visit a block and propagate the dense lattice forward along the control + /// flow edge from predecessor to block. `point` corresponds to the program + /// point before `block`. The default implementation merges in the state from + /// the predecessor's terminator. + virtual void visitBlockTransfer(Block *block, ProgramPoint *point, + Block *predecessor, + const AbstractDenseLattice &before, + AbstractDenseLattice *after) { + // Merge in the state from the predecessor's terminator. + join(after, before); + } + /// Propagate the dense lattice forward along the control flow edge from /// `regionFrom` to `regionTo` regions of the `branch` operation. `nullopt` /// values correspond to control flow branches originating at or targeting the @@ -259,6 +271,22 @@ class DenseForwardDataFlowAnalysis branch, regionFrom, regionTo, before, after); } + /// Hook for customizing the behavior of lattice propagation along the control + /// flow edges between blocks. The control flows from `predecessor` to + /// `block`. The lattice is propagated forward along this edge. The lattices + /// are as follows: + /// - `before` is the lattice at the end of the predecessor block; + /// - `after` is the lattice at the beginning of the block. + /// By default, the `after` state is simply joined with the `before` state. + /// Concrete analyses can override this behavior or delegate to the parent + /// call for the default behavior. + virtual void visitBlockTransfer(Block *block, ProgramPoint *point, + Block *predecessor, const LatticeT &before, + LatticeT *after) { + AbstractDenseForwardDataFlowAnalysis::visitBlockTransfer( + block, point, predecessor, before, after); + } + protected: /// Get the dense lattice on this lattice anchor. LatticeT *getLattice(LatticeAnchor anchor) override { @@ -306,6 +334,13 @@ class DenseForwardDataFlowAnalysis static_cast<const LatticeT &>(before), static_cast<LatticeT *>(after)); } + void visitBlockTransfer(Block *block, ProgramPoint *point, Block *predecessor, + const AbstractDenseLattice &before, + AbstractDenseLattice *after) final { + visitBlockTransfer(block, point, predecessor, + static_cast<const LatticeT &>(before), + static_cast<LatticeT *>(after)); + } }; //===----------------------------------------------------------------------===// @@ -388,6 +423,17 @@ class AbstractDenseBackwardDataFlowAnalysis : public DataFlowAnalysis { /// them into the same equivalent class. virtual void buildOperationEquivalentLatticeAnchor(Operation *op) {} + /// Visit a block and propagate the dense lattice backward along the control + /// flow edge from successor to block. `point` corresponds to the program + /// point after `block`. The default implementation merges in the state from + /// the successor's first operation or the block itself when empty. 
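+  /// As an illustrative sketch (the predicate name is hypothetical), an
+  /// analysis deriving from this class could override the hook to skip
+  /// propagation along edges it models through other means:
+  ///
+  ///   void visitBlockTransfer(Block *block, ProgramPoint *point,
+  ///                           Block *successor,
+  ///                           const AbstractDenseLattice &after,
+  ///                           AbstractDenseLattice *before) override {
+  ///     if (isHandledElsewhere(block, successor))
+  ///       return; // Leave `before` untouched for this edge.
+  ///     meet(before, after);
+  ///   }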
+ virtual void visitBlockTransfer(Block *block, ProgramPoint *point, + Block *successor, + const AbstractDenseLattice &after, + AbstractDenseLattice *before) { + meet(before, after); + } + /// Propagate the dense lattice backwards along the control flow edge from /// `regionFrom` to `regionTo` regions of the `branch` operation. `nullopt` /// values correspond to control flow branches originating at or targeting the @@ -531,6 +577,22 @@ class DenseBackwardDataFlowAnalysis branch, regionFrom, regionTo, after, before); } + /// Hook for customizing the behavior of lattice propagation along the control + /// flow edges between blocks. The control flows from `successor` to + /// `block`. The lattice is propagated back along this edge. The lattices + /// are as follows: + /// - `after` is the lattice at the beginning of the successor block; + /// - `before` is the lattice at the end of the block. + /// By default, the `before` state is simply met with the `after` state. + /// Concrete analyses can override this behavior or delegate to the parent + /// call for the default behavior. + virtual void visitBlockTransfer(Block *block, ProgramPoint *point, + Block *successor, const LatticeT &after, + LatticeT *before) { + AbstractDenseBackwardDataFlowAnalysis::visitBlockTransfer( + block, point, successor, after, before); + } + protected: /// Get the dense lattice at the given lattice anchor. LatticeT *getLattice(LatticeAnchor anchor) override { @@ -577,6 +639,13 @@ class DenseBackwardDataFlowAnalysis static_cast<const LatticeT &>(after), static_cast<LatticeT *>(before)); } + void visitBlockTransfer(Block *block, ProgramPoint *point, Block *successor, + const AbstractDenseLattice &after, + AbstractDenseLattice *before) final { + visitBlockTransfer(block, point, successor, + static_cast<const LatticeT &>(after), + static_cast<LatticeT *>(before)); + } }; } // end namespace dataflow diff --git a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h index 964281592cc65..cad6cec761ab8 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/VectorPattern.h @@ -92,12 +92,43 @@ class VectorConvertToLLVMPattern : public ConvertOpToLLVMPattern<SourceOp> { using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern; using Super = VectorConvertToLLVMPattern<SourceOp, TargetOp>; + /// Return the given type if it's a floating point type. If the given type is + /// a vector type, return its element type if it's a floating point type. + static FloatType getFloatingPointType(Type type) { + if (auto floatType = dyn_cast<FloatType>(type)) + return floatType; + if (auto vecType = dyn_cast<VectorType>(type)) + return dyn_cast<FloatType>(vecType.getElementType()); + return nullptr; + } + LogicalResult matchAndRewrite(SourceOp op, typename SourceOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { static_assert( std::is_base_of<OpTrait::OneResult<SourceOp>, SourceOp>::value, "expected single result op"); + + // The pattern should not apply if a floating-point operand is converted to + // a non-floating-point type. This indicates that the floating point type + // is not supported by the LLVM lowering. (Such types are converted to + // integers.) 
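+    // For instance, assuming a type converter that maps an unsupported float
+    // type (say, f8E4M3FN) to i8, an op with such an operand would be
+    // rejected here and left for other lowering paths to handle.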
+ auto checkType = [&](Value v) -> LogicalResult { + FloatType floatType = getFloatingPointType(v.getType()); + if (!floatType) + return success(); + Type convertedType = this->getTypeConverter()->convertType(floatType); + if (!isa_and_nonnull<FloatType>(convertedType)) + return rewriter.notifyMatchFailure(op, + "unsupported floating point type"); + return success(); + }; + for (Value operand : op->getOperands()) + if (failed(checkType(operand))) + return failure(); + if (failed(checkType(op->getResult(0)))) + return failure(); + // Determine attributes for the target op AttrConvert<SourceOp, TargetOp> attrConvert(op); diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index ba5e48e4ec9ba..10f0cc254ea97 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -476,9 +476,9 @@ def ReduxKind : I32EnumAttr<"ReduxKind", "NVVM redux kind", def ReduxKindAttr : EnumAttr<NVVM_Dialect, ReduxKind, "redux_kind">; def NVVM_ReduxOp : - NVVM_Op<"redux.sync", [NVVMRequiresSM<80>]>, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_Type:$val, + NVVM_Op<"redux.sync", [NVVMRequiresSM<80>, AllTypesMatch<["res", "val"]>]>, + Results<(outs AnyTypeOf<[I32, F32]>:$res)>, + Arguments<(ins AnyTypeOf<[I32, F32]>:$val, ReduxKindAttr:$kind, I32:$mask_and_clamp, DefaultValuedAttr<BoolAttr, "false">:$abs, @@ -496,6 +496,8 @@ def NVVM_ReduxOp : [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-redux-sync) }]; + let hasVerifier = 1; + string llvmBuilder = [{ auto intId = getReduxIntrinsicId($_resultType, $kind, $abs, $nan); $res = createIntrinsicCall(builder, intId, {$val, $mask_and_clamp}); @@ -656,8 +658,8 @@ def NVVM_MBarrierInvalOp : NVVM_Op<"mbarrier.inval">, } def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_AnyPointer:$addr)> { + Results<(outs I64:$res)>, + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr)> { let summary = "MBarrier Arrive Operation"; let description = [{ The `nvvm.mbarrier.arrive` operation performs an arrive-on operation on the @@ -674,36 +676,32 @@ def NVVM_MBarrierArriveOp : NVVM_Op<"mbarrier.arrive">, value are implementation-specific. The operation takes the following operand: - - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic - addressing, but the address must still be in the shared memory space. + - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr` + must be a pointer to generic or shared::cta memory. When it is generic, the + underlying address must be within the shared::cta memory space; otherwise + the behavior is undefined. 
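+
+    Illustrative example (the SSA names are placeholders; a shared::cta
+    pointer is assumed):
+
+    ```mlir
+    %token = nvvm.mbarrier.arrive %barrier : !llvm.ptr<3> -> i64
+    ```
+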
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) }]; - string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive, {$addr}); - }]; let assemblyFormat = "$addr attr-dict `:` type($addr) `->` type($res)"; -} - -def NVVM_MBarrierArriveSharedOp : NVVM_Op<"mbarrier.arrive.shared">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_PointerShared:$addr)> { - let summary = "Shared MBarrier Arrive Operation"; - let description = [{ - This Op is the same as `nvvm.mbarrier.arrive` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; + string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_shared, {$addr}); + auto [id, args] = NVVM::MBarrierArriveOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + $res = createIntrinsicCall(builder, id, args); }]; - let assemblyFormat = "$addr attr-dict `:` qualified(type($addr)) `->` type($res)"; } def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_AnyPointer:$addr, I32:$count)> { + Results<(outs I64:$res)>, + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr, + I32:$count)> { let summary = "MBarrier Arrive No-Complete Operation"; let description = [{ The `nvvm.mbarrier.arrive.nocomplete` operation performs an arrive-on operation @@ -721,33 +719,29 @@ def NVVM_MBarrierArriveNocompleteOp : NVVM_Op<"mbarrier.arrive.nocomplete">, captures the phase of the *mbarrier object* prior to the arrive-on operation. The operation takes the following operands: - - `addr`: A pointer to the memory location of the *mbarrier object*. Uses generic - addressing, but the address must still be in the shared memory space. + - `addr`: A pointer to the memory location of the *mbarrier object*. The `addr` + must be a pointer to generic or shared::cta memory. When it is generic, the + underlying address must be within the shared::cta memory space; otherwise + the behavior is undefined. - `count`: Integer specifying the count argument to the arrive-on operation. Must be in the valid range as specified in the *mbarrier object* contents. 
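+
+    Illustrative example (the SSA names are placeholders; a shared::cta
+    pointer is assumed):
+
+    ```mlir
+    %token = nvvm.mbarrier.arrive.nocomplete %barrier, %count : !llvm.ptr<3>, i32 -> i64
+    ```
+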
[For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) }]; - string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete, {$addr, $count}); - }]; - let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)"; -} -def NVVM_MBarrierArriveNocompleteSharedOp : NVVM_Op<"mbarrier.arrive.nocomplete.shared">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_PointerShared:$addr, I32:$count)> { - let summary = "Shared MBarrier Arrive No-Complete Operation"; - let description = [{ - This Op is the same as `nvvm.mbarrier.arrive.nocomplete` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. + let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)"; - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; + string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete_shared, {$addr, $count}); + auto [id, args] = NVVM::MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + $res = createIntrinsicCall(builder, id, args); }]; - let assemblyFormat = "$addr `,` $count attr-dict `:` type(operands) `->` type($res)"; } def NVVM_MBarrierArriveExpectTxOp : NVVM_PTXBuilder_Op<"mbarrier.arrive.expect_tx">, @@ -896,8 +890,9 @@ def NVVM_MBarrierTryWaitParitySharedOp : NVVM_PTXBuilder_Op<"mbarrier.try_wait.p } def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_AnyPointer:$addr, LLVM_Type:$state)> { + Results<(outs I1:$res)>, + Arguments<(ins AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr, + I64:$state)> { let summary = "MBarrier Non-Blocking Test Wait Operation"; let description = [{ The `nvvm.mbarrier.test.wait` operation performs a non-blocking test for the @@ -944,26 +939,20 @@ def NVVM_MBarrierTestWaitOp : NVVM_Op<"mbarrier.test.wait">, [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait) }]; - string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait, {$addr, $state}); - }]; - let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)"; -} -def NVVM_MBarrierTestWaitSharedOp : NVVM_Op<"mbarrier.test.wait.shared">, - Results<(outs LLVM_Type:$res)>, - Arguments<(ins LLVM_PointerShared:$addr, LLVM_Type:$state)> { - let summary = "Shared MBarrier Non-Blocking Test Wait Operation"; - let description = [{ - This Op is the same as `nvvm.mbarrier.test.wait` except that the *mbarrier object* - should be accessed using a shared-memory pointer instead of a generic-memory pointer. 
+ let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)"; - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-try-wait) + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); }]; + string llvmBuilder = [{ - $res = createIntrinsicCall(builder, llvm::Intrinsic::nvvm_mbarrier_test_wait_shared, {$addr, $state}); + auto [id, args] = NVVM::MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + $res = createIntrinsicCall(builder, id, args); }]; - let assemblyFormat = "$addr `,` $state attr-dict `:` type(operands) `->` type($res)"; } //===----------------------------------------------------------------------===// @@ -1326,9 +1315,9 @@ def ShflKindAttr : EnumAttr<NVVM_Dialect, ShflKind, "shfl_kind">; def NVVM_ShflOp : NVVM_Op<"shfl.sync", [NVVMRequiresSM<30>]>, - Results<(outs LLVM_Type:$res)>, + Results<(outs AnyTypeOf<[I32, F32, LLVMStructType]>:$res)>, Arguments<(ins I32:$thread_mask, - LLVM_Type:$val, + AnyTypeOf<[I32, F32]>:$val, I32:$offset, I32:$mask_and_clamp, ShflKindAttr:$kind, @@ -1344,6 +1333,11 @@ def NVVM_ShflOp : a mask for logically splitting warps into sub-segments and an upper bound for clamping the source lane index. + The `return_value_and_is_valid` unit attribute can be specified to indicate + that the return value is a two-element struct, where the first element is + the result value and the second element is a predicate indicating if the + computed source lane index is valid. + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-shfl-sync) }]; string llvmBuilder = [{ @@ -1534,47 +1528,30 @@ def NVVM_CpAsyncMBarrierArriveOp : NVVM_Op<"cp.async.mbarrier.arrive"> { The `cp.async.mbarrier.arrive` Op makes the *mbarrier object* track all prior cp.async operations initiated by the executing thread. The `addr` operand specifies the address of the *mbarrier object* - in generic address space. The `noinc` attr impacts how the - mbarrier's state is updated. + in generic or shared::cta address space. When it is generic, the + underlying memory should fall within the shared::cta space; + otherwise the behavior is undefined. The `noinc` attr impacts + how the mbarrier's state is updated. [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive) }]; - let assemblyFormat = "$addr attr-dict `:` type(operands)"; let arguments = (ins - LLVM_AnyPointer:$addr, DefaultValuedAttr<I1Attr, "0">:$noinc); - - string llvmBuilder = [{ - auto intId = $noinc ? - llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc : - llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive; + AnyTypeOf<[LLVM_PointerGeneric, LLVM_PointerShared]>:$addr, + DefaultValuedAttr<I1Attr, "0">:$noinc); - createIntrinsicCall(builder, intId, {$addr}); - }]; -} - -def NVVM_CpAsyncMBarrierArriveSharedOp : NVVM_Op<"cp.async.mbarrier.arrive.shared"> { - let summary = "NVVM Dialect Op for cp.async.mbarrier.arrive.shared"; - let description = [{ - The `cp.async.mbarrier.arrive.shared` Op makes the *mbarrier object* - track all prior cp.async operations initiated by the executing thread. 
- The `addr` operand specifies the address of the *mbarrier object* in - shared memory. The `noinc` attr impacts how the mbarrier's state - is updated. - - [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive) - }]; let assemblyFormat = "$addr attr-dict `:` type(operands)"; - let arguments = (ins - LLVM_PointerShared:$addr, DefaultValuedAttr<I1Attr, "0">:$noinc); + let extraClassDeclaration = [{ + static mlir::NVVM::IDArgPair + getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt, + llvm::IRBuilderBase& builder); + }]; string llvmBuilder = [{ - auto intId = $noinc ? - llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc_shared : - llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_shared; - - createIntrinsicCall(builder, intId, {$addr}); + auto [id, args] = NVVM::CpAsyncMBarrierArriveOp::getIntrinsicIDAndArgs( + *op, moduleTranslation, builder); + createIntrinsicCall(builder, id, args); }]; } @@ -1589,10 +1566,11 @@ def FPRoundingModeRM : I32EnumAttrCase<"RM", 2, "rm">; def FPRoundingModeRP : I32EnumAttrCase<"RP", 3, "rp">; def FPRoundingModeRZ : I32EnumAttrCase<"RZ", 4, "rz">; def FPRoundingModeRNA : I32EnumAttrCase<"RNA", 5, "rna">; +def FPRoundingModeRS : I32EnumAttrCase<"RS", 6, "rs">; def FPRoundingMode : I32EnumAttr<"FPRoundingMode", "NVVM FPRoundingMode kind", [FPRoundingModeNone, FPRoundingModeRN, FPRoundingModeRM, - FPRoundingModeRP, FPRoundingModeRZ, FPRoundingModeRNA]> { + FPRoundingModeRP, FPRoundingModeRZ, FPRoundingModeRNA, FPRoundingModeRS]> { let genSpecializedAttr = 0; let cppNamespace = "::mlir::NVVM"; } @@ -1906,6 +1884,96 @@ def NVVM_ConvertF6x2ToF16x2Op : def NVVM_ConvertF4x2ToF16x2Op : NVVM_ConvertToFP16x2Op_Base<"F4", I8, "F16">; +//===----------------------------------------------------------------------===// +// NVVM Stochastic Rounding Conversion Ops +//===----------------------------------------------------------------------===// + +// Base class for conversions from F32x2 to FPx2 formats +// (F16x2, BF16x2) +// TODO: In separate PR, add .rn and .rz rounding variants for this conversion +// as currently only support .rs rounding mode +class NVVM_ConvertF32x2ToFPx2OpBase<string dstFormat, string mnemonic, Type dstType> : + NVVM_Op<mnemonic, [Pure, NVVMRequiresSMa<[100, 103]>]>, + Results<(outs dstType:$dst)>, + Arguments<(ins F32:$src_hi, F32:$src_lo, I32:$rbits, + DefaultValuedAttr<FPRoundingModeAttr, "FPRoundingMode::RS">:$rnd, + DefaultValuedAttr<SaturationModeAttr, "SaturationMode::NONE">:$sat, + DefaultValuedAttr<BoolAttr, "false">:$relu)> { + let summary = "Convert two F32 values to packed " # dstFormat # " with stochastic rounding (.rs)"; + let description = [{ + Converts two F32 values to packed }] # dstFormat # [{ format using stochastic + rounding (.rs) mode with randomness provided by the `rbits` parameter. The + `relu` attribute clamps negative results to 0. The `sat` attribute determines + saturation behavior. The `src_hi` and `src_lo` parameters correspond to operands + `a` and `b` in the PTX ISA, respectively. 
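+
+    As an illustrative sketch for the f16x2 variant (the SSA names are
+    placeholders):
+
+    ```mlir
+    %d = nvvm.convert.f32x2.to.f16x2 %hi, %lo, %rbits : vector<2xf16>
+    ```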
+ + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt) + }]; + + let assemblyFormat = "$src_hi `,` $src_lo `,` $rbits attr-dict `:` type($dst)"; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + llvm::Intrinsic::ID getIntrinsicID(); + }]; + + string llvmBuilder = [{ + auto intId = op.getIntrinsicID(); + $dst = createIntrinsicCall(builder, intId, {$src_hi, $src_lo, $rbits}); + }]; + } + +// F32x2 -> F16x2 with stochastic rounding +def NVVM_ConvertF32x2ToF16x2Op : NVVM_ConvertF32x2ToFPx2OpBase<"f16x2", "convert.f32x2.to.f16x2", VectorOfLengthAndType<[2], [F16]>>; + +// F32x2 -> BF16x2 with stochastic rounding +def NVVM_ConvertF32x2ToBF16x2Op : NVVM_ConvertF32x2ToFPx2OpBase<"bf16x2", "convert.f32x2.to.bf16x2", VectorOfLengthAndType<[2], [BF16]>>; + +// Base class for stochastic rounding conversions from F32x4 to FPx4 formats +// (E4M3x4, E5M2x4, E2M3x4, E3M2x4, E2M1x4) +// These operations always use RS (stochastic rounding) mode with SATFINITE saturation. +class NVVM_ConvertF32x4ToFPx4OpBase<string dstFormat, string mnemonic, Type dstType> : + NVVM_Op<mnemonic, [Pure, NVVMRequiresSMa<[100, 103]>]>, + Results<(outs dstType:$dst)>, + Arguments<(ins VectorOfLengthAndType<[4], [F32]>:$src, I32:$rbits, + DefaultValuedAttr<BoolAttr, "false">:$relu, + TypeAttr:$dstTy)> { + let summary = "Convert vector<4xf32> to packed " # dstFormat # " with stochastic rounding (.rs) and satfinite"; + let description = [{ + Converts a vector<4xf32> to packed }] # dstFormat # [{ format using + stochastic rounding (.rs) mode with SATFINITE saturation. Randomness is + provided by the `rbits` parameter. The `dstTy` attribute specifies the + target floating-point format. The `relu` attribute clamps negative results to 0. + + Note: These operations always use RS rounding mode and SATFINITE saturation mode. 
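+
+    As an illustrative sketch for the f8x4 variant (the SSA names are
+    placeholders):
+
+    ```mlir
+    %d = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E4M3FN)
+    ```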
+ + [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt) + }]; + + let assemblyFormat = "$src `,` $rbits attr-dict `:` type($src) `->` type($dst) `(` $dstTy `)`"; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + llvm::Intrinsic::ID getIntrinsicID(); + }]; + + string llvmBuilder = [{ + auto intId = op.getIntrinsicID(); + $dst = createIntrinsicCall(builder, intId, {$src, $rbits}); + }]; +} + +// F32x4 -> F8x4 with stochastic rounding (supports E4M3FN, E5M2) +def NVVM_ConvertF32x4ToF8x4Op : NVVM_ConvertF32x4ToFPx4OpBase<"f8x4", "convert.f32x4.to.f8x4", VectorOfLengthAndType<[4], [I8]>>; + +// F32x4 -> F6x4 with stochastic rounding (supports E2M3FN, E3M2FN) +def NVVM_ConvertF32x4ToF6x4Op : NVVM_ConvertF32x4ToFPx4OpBase<"f6x4", "convert.f32x4.to.f6x4", VectorOfLengthAndType<[4], [I8]>>; + +// F32x4 -> F4x4 with stochastic rounding (supports E2M1FN) +def NVVM_ConvertF32x4ToF4x4Op : NVVM_ConvertF32x4ToFPx4OpBase<"f4x4", "convert.f32x4.to.f4x4", I16>; + //===----------------------------------------------------------------------===// // NVVM MMA Ops //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 5241f9a6f2b43..921fdf36a59b0 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -692,6 +692,38 @@ def ROCDL_GlobalLoadLDSOp : }]; } +//===---------------------------------------------------------------------===// +// Async load to LDS intrinsic (available in GFX1250) +//===---------------------------------------------------------------------===// + +foreach bitsVal = [8, 32, 64, 128] in { + defvar bitsStr = "b" # !cast<string>(bitsVal); + def ROCDL_GlobalLoadAsyncToLDS # !toupper(bitsStr) # Op : + ROCDL_IntrOp<"global.load.async.to.lds." # bitsStr, [], [], [], 0, 0, 1, 0, [2, 3], ["offset", "aux"]> { + dag args = (ins Arg<ROCDLGlobalBuffer, "", [MemRead]>:$globalPtr, + Arg<ROCDLBufferLDS, "", [MemWrite]>:$ldsPtr, + I32Attr:$offset, + I32Attr:$aux); + let arguments = !con(args, baseArgs); + let assemblyFormat = [{ + $globalPtr `,` $ldsPtr `,` $offset `,` $aux + attr-dict `:` type($globalPtr) `,` type($ldsPtr) + }]; + let description = [{ + Asynchronously loads }] # !cast<string>(bitsVal) # [{ bits of data from a global memory pointer + to a Local Data Share (LDS) pointer. + + Available on gfx1250+. + }]; + + let extraClassDefinition = [{ + ::llvm::SmallVector<::mlir::Value> $cppClass::getAccessedOperands() { + return {getGlobalPtr(), getLdsPtr()}; + } + }]; + } +} + //===---------------------------------------------------------------------===// // Tensor load/store intrinsics (available in GFX1250) //===---------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 2f4517ddfe754..c689b7e46ea9e 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -2557,6 +2557,12 @@ def OpenACC_LoopOp : OpenACC_Op<"loop", device-type-aware getter methods. When modifying these operands, the corresponding `device_type` attributes must be updated to maintain consistency between operands and their target device types. 
+ + The `unstructured` attribute indicates that the loops inside the OpenACC + construct contain early exits and cannot be lowered to structured MLIR + operations. When this flag is set, the acc.loop should have no induction + variables and the loop must be implemented via explicit control flow + inside its body. }]; let arguments = (ins @@ -2590,7 +2596,8 @@ def OpenACC_LoopOp : OpenACC_Op<"loop", OptionalAttr<SymbolRefArrayAttr>:$firstprivatizationRecipes, Variadic<AnyType>:$reductionOperands, OptionalAttr<SymbolRefArrayAttr>:$reductionRecipes, - OptionalAttr<OpenACC_CombinedConstructsAttr>:$combined + OptionalAttr<OpenACC_CombinedConstructsAttr>:$combined, + UnitAttr:$unstructured ); let results = (outs Variadic<AnyType>:$results); diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h index 1481859e94a92..0c059967bb898 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h @@ -30,9 +30,11 @@ class SliceAttr; } // namespace xegpu } // namespace mlir +// clang-format off +#include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc> #include <mlir/Dialect/XeGPU/IR/XeGPUAttrInterface.h.inc> #include <mlir/Dialect/XeGPU/IR/XeGPUDialect.h.inc> -#include <mlir/Dialect/XeGPU/IR/XeGPUEnums.h.inc> +// clang-format on #define GET_ATTRDEF_CLASSES #include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.h.inc> diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 40352b44b6441..9c35c07a7e587 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -223,17 +223,17 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> { InterfaceMethod<"Derive a new layout by dropping InstData", "xegpu::DistributeLayoutAttr", "dropInstData">, - InterfaceMethod<[{Delinearizes a linear subgroup ID into its multidimensional - indices based on the effective subgroup layout.}], + InterfaceMethod<[{Delinearizes a linear ID into its multidimensional + indices based on the effective layout level.}], "FailureOr<SmallVector<Value>>", - "delinearizeSubgroupId", + "delinearizeId", (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId)>, - InterfaceMethod<[{Generates instructions to compute multidimensional offsets for blocks - assigned to a subgroup identified by linearId. The shape parameter - represents the workgroup-level problem size. Each subgroup may access + InterfaceMethod<[{Generates instructions to compute multidimensional coordinates for dist units + assigned to a level identified by linearId. The shape parameter + represents the higher-level problem size. Each level may access multiple blocks according to round-robin distribution rules.}], "FailureOr<SmallVector<SmallVector<Value>>>", - "getOffsets", + "computeDistributedCoords", (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId, "ArrayRef<int64_t>":$shape)>, InterfaceMethod</*desc=*/[{Check if this layout can be achieved by applying a transpose to some other layout according to given permutation of (0...n-1).}], @@ -476,17 +476,17 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> { return {}; } - /// Delinearizes a linear subgroup ID into its multidimensional indices - /// based on the effective subgroup layout. + /// Delinearizes a linear ID into its multidimensional indices + /// based on the effective level of the layout. 
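+    /// For example, assuming an effective layout of [4, 8] and row-major
+    /// delinearization (both assumed here purely for illustration), a linear
+    /// id of 10 would map to the indices [1, 2].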
FailureOr<SmallVector<Value>> - delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId); + delinearizeId(OpBuilder &builder, Location loc, Value linearId); - /// Generates instructions to compute multidimensional offsets for blocks - /// assigned to a subgroup identified by linearId. The shape parameter - /// represents the workgroup-level problem size. Each subgroup may access + /// Generates instructions to compute multidimensional coordinates for dist units + /// assigned to a level identified by linearId. The shape parameter + /// represents the higher-level problem size. Each `level` may access /// multiple blocks according to round-robin distribution rules. FailureOr<SmallVector<SmallVector<Value>>> - getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape); + computeDistributedCoords(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape); /// Check if this is slice of some other layout. bool isSliceOf(const xegpu::DistributeLayoutAttr &other) { return false; } @@ -643,14 +643,15 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> { /// Delinearizes a linear subgroup ID into its multidimensional indices /// based on the effective subgroup layout. FailureOr<SmallVector<Value>> - delinearizeSubgroupId(OpBuilder &builder, Location loc, Value linearId); + delinearizeId(OpBuilder &builder, Location loc, Value linearId); - /// Generates instructions to compute multidimensional offsets for blocks + /// Generates instructions to compute multidimensional coordinates for blocks /// assigned to a subgroup identified by linearId. The shape parameter /// represents the workgroup-level problem size. Each subgroup may access /// multiple blocks according to round-robin distribution rules. + FailureOr<SmallVector<SmallVector<Value>>> - getOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape); + computeDistributedCoords(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape); /// Check if this is slice of some other layout. 
bool isSliceOf(const xegpu::DistributeLayoutAttr &other); diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 426377fcf598f..689ebd0d1179a 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -843,7 +843,8 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr<I64Attr>:$chunk_size, OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint, OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint, - OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint); + OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint, + OptionalAttr<XeGPU_LayoutAttr>:$layout); let results = (outs AnyTypeOf<[XeGPU_ValueType, XeGPU_ScalarType]>:$value); let extraClassDeclaration = extraBaseClassDeclaration # [{ @@ -895,7 +896,14 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [MemoryEffects<[MemRead]>]> { "IntegerAttr": $chunk_size, "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, - "xegpu::CachePolicyAttr": $l3_hint)> + "xegpu::CachePolicyAttr": $l3_hint)>, + OpBuilder<(ins "Type": $value, "Value": $source, + "ArrayRef<OpFoldResult>": $offsets, "Value": $mask, + "IntegerAttr": $chunk_size, + "xegpu::CachePolicyAttr": $l1_hint, + "xegpu::CachePolicyAttr": $l2_hint, + "xegpu::CachePolicyAttr": $l3_hint, + "xegpu::LayoutAttr": $layout)> ]; let hasVerifier = 1; @@ -979,7 +987,8 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { AnyTypeOf<[XeGPU_MaskType, I1]>:$mask, OptionalAttr<I64Attr>:$chunk_size, OptionalAttr<XeGPU_CacheHintAttr>:$l1_hint, OptionalAttr<XeGPU_CacheHintAttr>:$l2_hint, - OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint); + OptionalAttr<XeGPU_CacheHintAttr>:$l3_hint, + OptionalAttr<XeGPU_LayoutAttr>:$layout); let extraClassDeclaration = extraBaseClassDeclaration#[{ Type getDestType() { @@ -1030,7 +1039,14 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [MemoryEffects<[MemWrite]>]> { "IntegerAttr": $chunk_size, "xegpu::CachePolicyAttr": $l1_hint, "xegpu::CachePolicyAttr": $l2_hint, - "xegpu::CachePolicyAttr": $l3_hint)> + "xegpu::CachePolicyAttr": $l3_hint)>, + OpBuilder<(ins "Value": $value, "Value": $dest, + "ArrayRef<OpFoldResult>": $offsets, "Value": $mask, + "IntegerAttr": $chunk_size, + "xegpu::CachePolicyAttr": $l1_hint, + "xegpu::CachePolicyAttr": $l2_hint, + "xegpu::CachePolicyAttr": $l3_hint, + "xegpu::LayoutAttr": $layout)> ]; let hasVerifier = 1; diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index b7af5413669c9..e42799689e490 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -26,7 +26,7 @@ def XeGPUSubgroupDistribute : Pass<"xegpu-subgroup-distribute"> { The pass distributes subgroup level (SIMD) XeGPU ops to work items. }]; let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect", - "vector::VectorDialect"]; + "vector::VectorDialect", "index::IndexDialect"]; } def XeGPUPropagateLayout : Pass<"xegpu-propagate-layout"> { @@ -85,4 +85,16 @@ def XeGPUVectorLinearize : Pass<"xegpu-vector-linearize"> { "scf::SCFDialect", "ub::UBDialect", "vector::VectorDialect"]; } +def XeGPUOptimizeBlockLoads : Pass<"xegpu-optimize-block-loads"> { + let summary = "Optimize XeGPU block load operations"; + let description = [{ + This pass rewrites XeGPU loadNd operations into more optimal forms + to improve performance. 
This includes: - Rewriting transpose B loads into more optimal forms to use HW block + transpose instructions for better performance. + }]; + let dependentDialects = ["memref::MemRefDialect", "xegpu::XeGPUDialect", + "vector::VectorDialect"]; +} + #endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h index a480195eebd00..1776a209d0bf1 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Transforms.h @@ -61,7 +61,8 @@ struct UnrollOptions { /// Appends patterns for folding aliasing ops into XeGPU ops into `patterns`. void populateXeGPUFoldAliasOpsPatterns(RewritePatternSet &patterns); - +/// Appends patterns for optimizing block load operations into `patterns`. +void populateXeGPUOptimizeBlockLoadsPatterns(RewritePatternSet &patterns); /// Appends patterns for XeGPU SIMT distribution into `patterns`. void populateXeGPUSubgroupDistributePatterns(RewritePatternSet &patterns); /// Appends patterns for moving function body into gpu.warp_execute_on_lane0 op. diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 04cfd58d846a7..58092c3bb9ed2 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -104,11 +104,15 @@ void removeLayoutAttrs(Operation *op); /// Sets the DistributeLayoutAttr for a given OpOperand or OpResult by attaching /// it to the owner's dictionary attributes +/// If `respectPermLayout` is true, the existing permanent layout +/// attribute will be kept and assigned to the attribute dict instead +/// of the provided layout. template <typename T, typename = std::enable_if_t<std::is_same_v<T, OpOperand> || std::is_same_v<T, OpResult>>> void setDistributeLayoutAttr(const T &operandOrResult, - const DistributeLayoutAttr layout); + const DistributeLayoutAttr layout, + bool respectPermLayout = false); /// Set the DistributeLayoutAttr for each OpOperand and OpResult of the given /// operation. If the operation contains regions, it is also applied recursively @@ -162,6 +166,15 @@ SmallVector<OpFoldResult> addElementwise(OpBuilder &builder, Location loc, SmallVector<OpFoldResult> addWithRightAligned(OpBuilder &builder, Location loc, ArrayRef<OpFoldResult> lhs, ArrayRef<OpFoldResult> rhs); + +/// Helper function to find a proper instruction multiple for the user-supplied +/// sg-level data shape (given by `dim`). `candidates` are uArch allowed shapes. +/// `candidateMultiples` are uArch multiples of such shapes (i.e. block count or +/// array length). +template <typename T> +int getLargestDivisor(T dim, ArrayRef<T> candidates, + ArrayRef<T> candidateMultiples = {}); + } // namespace xegpu } // namespace mlir diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index ed7e2a08ebfd9..5ac9e26e8636d 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -981,6 +981,28 @@ class ConversionPatternRewriter final : public PatternRewriter { /// Return a reference to the internal implementation. detail::ConversionPatternRewriterImpl &getImpl(); + /// Attempt to legalize the given operation. This can be used within + /// conversion patterns to change the default pre-order legalization order.
+ /// Returns "success" if the operation was legalized, "failure" otherwise. + /// + /// Note: In a partial conversion, this function returns "success" even if + /// the operation could not be legalized, as long as it was not explicitly + /// marked as illegal in the conversion target. + LogicalResult legalize(Operation *op); + + /// Attempt to legalize the given region. This can be used within + /// conversion patterns to change the default pre-order legalization order. + /// Returns "success" if the region was legalized, "failure" otherwise. + /// + /// If the current pattern runs with a type converter, the entry block + /// signature will be converted before legalizing the operations in the + /// region. + /// + /// Note: In a partial conversion, this function returns "success" even if + /// an operation could not be legalized, as long as it was not explicitly + /// marked as illegal in the conversion target. + LogicalResult legalize(Region *r); + private: // Allow OperationConverter to construct new rewriters. friend struct OperationConverter; @@ -989,7 +1011,8 @@ class ConversionPatternRewriter final : public PatternRewriter { /// conversions. They apply some IR rewrites in a delayed fashion and could /// bring the IR into an inconsistent state when used standalone. explicit ConversionPatternRewriter(MLIRContext *ctx, - const ConversionConfig &config); + const ConversionConfig &config, + OperationConverter &converter); // Hide unsupported pattern rewriter API. using OpBuilder::setListener; diff --git a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp index 0682e5f26785a..22bc0b32a9bd1 100644 --- a/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp +++ b/mlir/lib/Analysis/DataFlow/DenseAnalysis.cpp @@ -266,9 +266,10 @@ void AbstractDenseForwardDataFlowAnalysis::visitBlock(Block *block) { } LDBG() << " Joining state from predecessor " << predecessor; + const AbstractDenseLattice &before = *getLatticeFor( + point, getProgramPointAfter(predecessor->getTerminator())); // Merge in the state from the predecessor's terminator. - join(after, *getLatticeFor( - point, getProgramPointAfter(predecessor->getTerminator()))); + visitBlockTransfer(block, point, predecessor, before, after); } } @@ -614,7 +615,9 @@ void AbstractDenseBackwardDataFlowAnalysis::visitBlock(Block *block) { LDBG() << " Meeting state from successor " << successor; // Merge in the state from the successor: either the first operation, or the // block itself when empty. 
- meet(before, *getLatticeFor(point, getProgramPointBefore(successor))); + visitBlockTransfer(block, point, successor, + *getLatticeFor(point, getProgramPointBefore(successor)), + before); } } diff --git a/mlir/lib/AsmParser/Parser.cpp b/mlir/lib/AsmParser/Parser.cpp index 82bdb844480f1..74936e32bd9d9 100644 --- a/mlir/lib/AsmParser/Parser.cpp +++ b/mlir/lib/AsmParser/Parser.cpp @@ -407,8 +407,8 @@ Parser::parseFloatFromIntegerLiteral(std::optional<APFloat> &result, "hexadecimal float constant out of range for type"); } - APInt truncatedValue(typeSizeInBits, intValue.getNumWords(), - intValue.getRawData()); + APInt truncatedValue(typeSizeInBits, + ArrayRef(intValue.getRawData(), intValue.getNumWords())); result.emplace(semantics, truncatedValue); return success(); } diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index ec182f1db48ac..9348d3c172a07 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -865,13 +865,7 @@ struct NVGPUMBarrierArriveLowering adaptor.getMbarId(), rewriter); Type tokenType = getTypeConverter()->convertType( nvgpu::MBarrierTokenType::get(op->getContext())); - if (isMbarrierShared(op.getBarriers().getType())) { - rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveSharedOp>(op, tokenType, - barrier); - } else { - rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveOp>(op, tokenType, - barrier); - } + rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveOp>(op, tokenType, barrier); return success(); } }; @@ -892,13 +886,8 @@ struct NVGPUMBarrierArriveNoCompleteLowering Type tokenType = getTypeConverter()->convertType( nvgpu::MBarrierTokenType::get(op->getContext())); Value count = truncToI32(b, adaptor.getCount()); - if (isMbarrierShared(op.getBarriers().getType())) { - rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteSharedOp>( - op, tokenType, barrier, count); - } else { - rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteOp>( - op, tokenType, barrier, count); - } + rewriter.replaceOpWithNewOp<NVVM::MBarrierArriveNocompleteOp>( + op, tokenType, barrier, count); return success(); } }; @@ -915,13 +904,8 @@ struct NVGPUMBarrierTestWaitLowering getMbarrierPtr(b, op.getBarriers().getType(), adaptor.getBarriers(), adaptor.getMbarId(), rewriter); Type retType = rewriter.getI1Type(); - if (isMbarrierShared(op.getBarriers().getType())) { - rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitSharedOp>( - op, retType, barrier, adaptor.getToken()); - } else { - rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitOp>( - op, retType, barrier, adaptor.getToken()); - } + rewriter.replaceOpWithNewOp<NVVM::MBarrierTestWaitOp>(op, retType, barrier, + adaptor.getToken()); return success(); } }; diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index 91c1aa55fdb4e..1b4d1a42614ea 100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -97,57 +97,23 @@ static LogicalResult transferPreconditions(PatternRewriter &rewriter, return success(); } -static xegpu::CreateNdDescOp -createNdDescriptor(PatternRewriter &rewriter, Location loc, - xegpu::TensorDescType descType, TypedValue<MemRefType> src, - Operation::operand_range offsets) { +static xegpu::CreateNdDescOp createNdDescriptor(PatternRewriter &rewriter, + Location loc, + xegpu::TensorDescType descType, + TypedValue<MemRefType> src) { MemRefType srcTy = src.getType(); auto 
[strides, offset] = srcTy.getStridesAndOffset(); xegpu::CreateNdDescOp ndDesc; if (srcTy.hasStaticShape()) { - ndDesc = xegpu::CreateNdDescOp::create(rewriter, loc, descType, src, - getAsOpFoldResult(offsets)); + ndDesc = xegpu::CreateNdDescOp::create(rewriter, loc, descType, src); } else { // In case of any dynamic shapes, source's shape and strides have to be // explicitly provided. - SmallVector<Value> sourceDims; - unsigned srcRank = srcTy.getRank(); - for (unsigned i = 0; i < srcRank; ++i) - sourceDims.push_back(memref::DimOp::create(rewriter, loc, src, i)); - - SmallVector<int64_t> constOffsets; - SmallVector<Value> dynOffsets; - for (Value offset : offsets) { - std::optional<int64_t> staticVal = getConstantIntValue(offset); - if (!staticVal) - dynOffsets.push_back(offset); - constOffsets.push_back(staticVal.value_or(ShapedType::kDynamic)); - } - - SmallVector<Value> dynShapes; - for (auto [idx, shape] : llvm::enumerate(srcTy.getShape())) { - if (shape == ShapedType::kDynamic) - dynShapes.push_back(sourceDims[idx]); - } - - // Compute strides in reverse order. - SmallVector<Value> dynStrides; - Value accStride = arith::ConstantIndexOp::create(rewriter, loc, 1); - // Last stride is guaranteed to be static and unit. - for (int i = static_cast<int>(strides.size()) - 2; i >= 0; --i) { - accStride = - arith::MulIOp::create(rewriter, loc, accStride, sourceDims[i + 1]); - if (strides[i] == ShapedType::kDynamic) - dynStrides.push_back(accStride); - } - std::reverse(dynStrides.begin(), dynStrides.end()); - - ndDesc = xegpu::CreateNdDescOp::create( - rewriter, loc, descType, src, dynOffsets, dynShapes, dynStrides, - DenseI64ArrayAttr::get(rewriter.getContext(), constOffsets), - DenseI64ArrayAttr::get(rewriter.getContext(), srcTy.getShape()), - DenseI64ArrayAttr::get(rewriter.getContext(), strides)); + auto meta = memref::ExtractStridedMetadataOp::create(rewriter, loc, src); + ndDesc = xegpu::CreateNdDescOp::create(rewriter, loc, descType, src, + meta.getConstifiedMixedSizes(), + meta.getConstifiedMixedStrides()); } return ndDesc; @@ -392,6 +358,62 @@ static Value computeOffsets(PatternRewriter &rewriter, OpType gatScatOp, .getResult(); } +// Collapses shapes of a nD memref to the target rank while applying offsets for +// the collapsed dimensions. Returns the new memref value and the remaining +// offsets for the last targetRank dimensions. 
For example: +// input: %memref = memref<2x4x8x32xf32>, offsets=[%i0, %i1, %i2, %i3], +// output: %memref[%i0, %i1, 0, 0] -> memref<8x32xf32>, offsets: [%i2, %i3] +static std::pair<Value, SmallVector<OpFoldResult>> +convertMemrefAndOffsetsToTargetRank(PatternRewriter &rewriter, Location loc, + Value memref, + SmallVector<OpFoldResult> offsets, + int64_t targetRank) { + auto memrefType = cast<MemRefType>(memref.getType()); + unsigned rank = memrefType.getRank(); + + if (rank <= targetRank) + return {memref, offsets}; + + int64_t numCombinedDims = rank - targetRank; + SmallVector<OpFoldResult> subviewOffsets; + SmallVector<OpFoldResult> subviewSizes; + SmallVector<OpFoldResult> subviewStrides; + + // For the combined dimensions: use the provided offsets, size=1, stride=1 + for (unsigned i = 0; i < numCombinedDims; ++i) { + subviewOffsets.push_back(offsets[i]); + subviewSizes.push_back(rewriter.getI64IntegerAttr(1)); + subviewStrides.push_back(rewriter.getI64IntegerAttr(1)); + } + + // For the last targetRank dimensions: offset=0, use full size, stride=1 + SmallVector<int64_t> resultShape; + auto originalShape = memrefType.getShape(); + auto meta = memref::ExtractStridedMetadataOp::create(rewriter, loc, memref); + for (unsigned i = numCombinedDims; i < rank; ++i) { + subviewOffsets.push_back(rewriter.getI64IntegerAttr(0)); + if (ShapedType::isDynamic(originalShape[i])) { + subviewSizes.push_back(meta.getSizes()[i]); + resultShape.push_back(ShapedType::kDynamic); + } else { + subviewSizes.push_back(rewriter.getI64IntegerAttr(originalShape[i])); + resultShape.push_back(originalShape[i]); + } + subviewStrides.push_back(rewriter.getI64IntegerAttr(1)); + } + + auto resultType = memref::SubViewOp::inferRankReducedResultType( + resultShape, memrefType, subviewOffsets, subviewSizes, subviewStrides); + auto subviewOp = + memref::SubViewOp::create(rewriter, loc, resultType, memref, + subviewOffsets, subviewSizes, subviewStrides); + + // Return the remaining offsets for the last targetRank dimensions + SmallVector<OpFoldResult> newOffsets(offsets.begin() + numCombinedDims, + offsets.end()); + return {subviewOp.getResult(), newOffsets}; +} + template < typename OpType, typename = std::enable_if_t<llvm::is_one_of< @@ -435,7 +457,8 @@ static LogicalResult lowerToScatteredLoadOp(vector::TransferReadOp readOp, /*chunk_size=*/IntegerAttr{}, /*l1_hint=*/xegpu::CachePolicyAttr{}, /*l2_hint=*/xegpu::CachePolicyAttr{}, - /*l3_hint=*/xegpu::CachePolicyAttr{}); + /*l3_hint=*/xegpu::CachePolicyAttr{}, + /*layout=*/nullptr); rewriter.replaceOp(readOp, gatherOp.getResult()); return success(); @@ -469,7 +492,8 @@ static LogicalResult lowerToScatteredStoreOp(vector::TransferWriteOp writeOp, /*chunk_size=*/IntegerAttr{}, /*l1_hint=*/xegpu::CachePolicyAttr{}, /*l2_hint=*/xegpu::CachePolicyAttr{}, - /*l3_hint=*/xegpu::CachePolicyAttr{}); + /*l3_hint=*/xegpu::CachePolicyAttr{}, + /*layout=*/nullptr); rewriter.eraseOp(writeOp); return success(); } @@ -523,18 +547,19 @@ struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> { descShape, elementType, /*array_length=*/1, /*boundary_check=*/isOutOfBounds, xegpu::MemorySpace::Global); - xegpu::CreateNdDescOp ndDesc = - createNdDescriptor(rewriter, loc, descType, - dyn_cast<TypedValue<MemRefType>>(readOp.getBase()), - readOp.getIndices()); - DenseI64ArrayAttr transposeAttr = !isTransposeLoad ? 
nullptr : DenseI64ArrayAttr::get(rewriter.getContext(), ArrayRef<int64_t>{1, 0}); + auto [src, indices] = convertMemrefAndOffsetsToTargetRank( + rewriter, loc, readOp.getBase(), getAsOpFoldResult(readOp.getIndices()), + vecTy.getRank()); // By default, no specific caching policy is assigned. xegpu::CachePolicyAttr hint = nullptr; - auto loadOp = xegpu::LoadNdOp::create(rewriter, loc, vecTy, ndDesc, + xegpu::CreateNdDescOp ndDesc = createNdDescriptor( + rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src)); + + auto loadOp = xegpu::LoadNdOp::create(rewriter, loc, vecTy, ndDesc, indices, /*packed=*/nullptr, transposeAttr, /*l1_hint=*/hint, /*l2_hint=*/hint, /*l3_hint=*/hint); @@ -575,21 +600,23 @@ struct TransferWriteLowering if (!map.isMinorIdentity()) return rewriter.notifyMatchFailure(writeOp, "Expects identity map"); + auto [src, indices] = convertMemrefAndOffsetsToTargetRank( + rewriter, loc, writeOp.getBase(), + getAsOpFoldResult(writeOp.getIndices()), vecTy.getRank()); + auto descType = xegpu::TensorDescType::get( vecTy.getShape(), vecTy.getElementType(), /*array_length=*/1, /*boundary_check=*/writeOp.hasOutOfBoundsDim(), xegpu::MemorySpace::Global); - xegpu::CreateNdDescOp ndDesc = - createNdDescriptor(rewriter, loc, descType, - dyn_cast<TypedValue<MemRefType>>(writeOp.getBase()), - writeOp.getIndices()); - // By default, no specific caching policy is assigned. xegpu::CachePolicyAttr hint = nullptr; - auto storeOp = - xegpu::StoreNdOp::create(rewriter, loc, writeOp.getVector(), ndDesc, - /*l1_hint=*/hint, - /*l2_hint=*/hint, /*l3_hint=*/hint); + xegpu::CreateNdDescOp ndDesc = createNdDescriptor( + rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src)); + + auto storeOp = xegpu::StoreNdOp::create(rewriter, loc, writeOp.getVector(), + ndDesc, indices, + /*l1_hint=*/hint, + /*l2_hint=*/hint, /*l3_hint=*/hint); rewriter.replaceOp(writeOp, storeOp); return success(); @@ -621,7 +648,8 @@ struct GatherLowering : public OpRewritePattern<vector::GatherOp> { /*chunk_size=*/IntegerAttr{}, /*l1_hint=*/xegpu::CachePolicyAttr{}, /*l2_hint=*/xegpu::CachePolicyAttr{}, - /*l3_hint=*/xegpu::CachePolicyAttr{}); + /*l3_hint=*/xegpu::CachePolicyAttr{}, + /*layout=*/nullptr); auto selectOp = arith::SelectOp::create(rewriter, loc, gatherOp.getMask(), @@ -655,7 +683,8 @@ struct ScatterLowering : public OpRewritePattern<vector::ScatterOp> { /*chunk_size=*/IntegerAttr{}, /*l1_hint=*/xegpu::CachePolicyAttr{}, /*l2_hint=*/xegpu::CachePolicyAttr{}, - /*l3_hint=*/xegpu::CachePolicyAttr{}); + /*l3_hint=*/xegpu::CachePolicyAttr{}, + /*layout=*/nullptr); rewriter.eraseOp(scatterOp); return success(); } @@ -674,19 +703,24 @@ struct LoadLowering : public OpRewritePattern<vector::LoadOp> { // Boundary check is available only for block instructions. bool boundaryCheck = vecTy.getRank() > 1; + // By default, no specific caching policy is assigned. + xegpu::CachePolicyAttr hint = nullptr; + + auto [src, indices] = convertMemrefAndOffsetsToTargetRank( + rewriter, loc, loadOp.getBase(), getAsOpFoldResult(loadOp.getIndices()), + vecTy.getRank()); auto descType = xegpu::TensorDescType::get( vecTy.getShape(), vecTy.getElementType(), /*array_length=*/1, boundaryCheck, xegpu::MemorySpace::Global); - xegpu::CreateNdDescOp ndDesc = createNdDescriptor( - rewriter, loc, descType, loadOp.getBase(), loadOp.getIndices()); - // By default, no specific caching policy is assigned. 
- xegpu::CachePolicyAttr hint = nullptr; - auto loadNdOp = xegpu::LoadNdOp::create( - rewriter, loc, vecTy, ndDesc, /*packed=*/nullptr, /*transpose=*/nullptr, - /*l1_hint=*/hint, - /*l2_hint=*/hint, /*l3_hint=*/hint); + xegpu::CreateNdDescOp ndDesc = createNdDescriptor( + rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src)); + auto loadNdOp = + xegpu::LoadNdOp::create(rewriter, loc, vecTy, ndDesc, indices, + /*packed=*/nullptr, /*transpose=*/nullptr, + /*l1_hint=*/hint, + /*l2_hint=*/hint, /*l3_hint=*/hint); rewriter.replaceOp(loadOp, loadNdOp); return success(); @@ -708,18 +742,24 @@ struct StoreLowering : public OpRewritePattern<vector::StoreOp> { // Boundary check is available only for block instructions. bool boundaryCheck = vecTy.getRank() > 1; + auto [src, indices] = convertMemrefAndOffsetsToTargetRank( + rewriter, loc, storeOp.getBase(), + getAsOpFoldResult(storeOp.getIndices()), vecTy.getRank()); + auto descType = xegpu::TensorDescType::get( vecTy.getShape(), vecTy.getElementType(), /*array_length=*/1, boundaryCheck, xegpu::MemorySpace::Global); - xegpu::CreateNdDescOp ndDesc = createNdDescriptor( - rewriter, loc, descType, storeOp.getBase(), storeOp.getIndices()); // By default, no specific caching policy is assigned. xegpu::CachePolicyAttr hint = nullptr; + xegpu::CreateNdDescOp ndDesc = createNdDescriptor( + rewriter, loc, descType, dyn_cast<TypedValue<MemRefType>>(src)); + auto storeNdOp = - xegpu::StoreNdOp::create(rewriter, loc, vector, ndDesc, + xegpu::StoreNdOp::create(rewriter, loc, vector, ndDesc, indices, /*l1_hint=*/hint, /*l2_hint=*/hint, /*l3_hint=*/hint); + rewriter.replaceOp(storeOp, storeNdOp); return success(); diff --git a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp index 33e8f2ed1f6ed..de552ce22ef62 100644 --- a/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp +++ b/mlir/lib/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp @@ -562,6 +562,8 @@ class LoadStoreMatrixToXeVMPattern : public OpConversionPattern<OpType> { VectorType valOrResVecTy = dyn_cast<VectorType>(data.getType()); if (!valOrResVecTy) valOrResVecTy = VectorType::get(1, data.getType()); + if (valOrResVecTy.getShape().size() != 1) + return rewriter.notifyMatchFailure(op, "Expected 1D data vector."); int64_t elemBitWidth = valOrResVecTy.getElementType().getIntOrFloatBitWidth(); diff --git a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp index 4d2d8738aa4ad..3d1a73417d1ea 100644 --- a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp @@ -66,9 +66,10 @@ static Value getSupportedReduction(AffineForOp forOp, unsigned pos, .Case([](arith::MaxSIOp) { return arith::AtomicRMWKind::maxs; }) .Case([](arith::MinUIOp) { return arith::AtomicRMWKind::minu; }) .Case([](arith::MaxUIOp) { return arith::AtomicRMWKind::maxu; }) + .Case([](arith::XOrIOp) { return arith::AtomicRMWKind::xori; }) + .Case([](arith::MaxNumFOp) { return arith::AtomicRMWKind::maxnumf; }) + .Case([](arith::MinNumFOp) { return arith::AtomicRMWKind::minnumf; }) .Default([](Operation *) -> std::optional<arith::AtomicRMWKind> { - // TODO: AtomicRMW supports other kinds of reductions this is - // currently not detecting, add those when the need arises. 
return std::nullopt; }); if (!maybeKind) diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index a5ffb9e77fa9d..d43f8815be16d 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -365,6 +365,59 @@ LogicalResult ConvertF4x2ToF16x2Op::verify() { return success(); } +//===----------------------------------------------------------------------===// +// Stochastic Rounding Conversion Ops +//===----------------------------------------------------------------------===// + +LogicalResult ConvertF32x2ToF16x2Op::verify() { + if (getRnd() != FPRoundingMode::RS) + return emitOpError("Only RS rounding mode is supported for " + "conversions from f32x2 to f16x2."); + return success(); +} + +LogicalResult ConvertF32x2ToBF16x2Op::verify() { + if (getRnd() != FPRoundingMode::RS) + return emitOpError("Only RS rounding mode is supported for " + "conversions from f32x2 to bf16x2."); + return success(); +} + +LogicalResult ConvertF32x4ToF8x4Op::verify() { + mlir::MLIRContext *ctx = getContext(); + + if (!llvm::isa<mlir::Float8E4M3FNType, mlir::Float8E5M2Type>(getDstTy())) + return emitOpError("Only ") + << mlir::Float8E4M3FNType::get(ctx) << " and " + << mlir::Float8E5M2Type::get(ctx) + << " types are supported for conversions from f32x4 to f8x4."; + + return success(); +} + +LogicalResult ConvertF32x4ToF6x4Op::verify() { + mlir::MLIRContext *ctx = getContext(); + + if (!llvm::isa<mlir::Float6E2M3FNType, mlir::Float6E3M2FNType>(getDstTy())) + return emitOpError("Only ") + << mlir::Float6E2M3FNType::get(ctx) << " and " + << mlir::Float6E3M2FNType::get(ctx) + << " types are supported for conversions from f32x4 to f6x4."; + + return success(); +} + +LogicalResult ConvertF32x4ToF4x4Op::verify() { + mlir::MLIRContext *ctx = getContext(); + + if (!llvm::isa<mlir::Float4E2M1FNType>(getDstTy())) + return emitOpError("Only ") << mlir::Float4E2M1FNType::get(ctx) + << " type is supported for conversions from " + "f32x4 to f4x4."; + + return success(); +} + LogicalResult BulkStoreOp::verify() { if (getInitVal() != 0) return emitOpError("only 0 is supported for initVal, got ") << getInitVal(); @@ -867,15 +920,40 @@ LogicalResult MmaOp::verify() { } LogicalResult ShflOp::verify() { - if (!(*this)->getAttrOfType<UnitAttr>("return_value_and_is_valid")) - return success(); - auto type = llvm::dyn_cast<LLVM::LLVMStructType>(getType()); - auto elementType = (type && type.getBody().size() == 2) - ? 
llvm::dyn_cast<IntegerType>(type.getBody()[1]) - : nullptr; - if (!elementType || elementType.getWidth() != 1) - return emitError("expected return type to be a two-element struct with " - "i1 as the second element"); + auto returnStructType = llvm::dyn_cast<LLVM::LLVMStructType>(getType()); + + auto verifyTypeError = [&](Twine desc, Type expectedType, + Type actualType) -> LogicalResult { + return emitOpError("expected " + desc + " to be of type ") + << expectedType << " but got " << actualType << " instead"; + }; + + if (returnStructType) { + if (!getReturnValueAndIsValid()) + return emitOpError("\"return_value_and_is_valid\" attribute must be " + "specified when the return type is a struct type"); + + if (returnStructType.getBody().size() != 2) + return emitOpError("expected return type to be a two-element struct"); + + llvm::ArrayRef<Type> returnStruct = returnStructType.getBody(); + auto resultType = returnStruct[0]; + if (resultType != getVal().getType()) + return verifyTypeError("first element in the returned struct", + getVal().getType(), resultType); + + auto predicateType = returnStruct[1]; + if (!predicateType.isInteger(1)) + return verifyTypeError("second element in the returned struct", + mlir::IntegerType::get(getContext(), 1), + predicateType); + } else { + if (getReturnValueAndIsValid()) + return emitOpError("expected return type to be a two-element struct"); + + if (getType() != getVal().getType()) + return verifyTypeError("return type", getVal().getType(), getType()); + } return success(); } @@ -1577,6 +1655,43 @@ LogicalResult NVVM::ClusterLaunchControlQueryCancelOp::verify() { return success(); } +LogicalResult NVVM::ReduxOp::verify() { + mlir::Type reduxType = getType(); + + if (!reduxType.isF32()) { + if (getAbs()) + return emitOpError("abs attribute is supported only for f32 type"); + if (getNan()) + return emitOpError("nan attribute is supported only for f32 type"); + } + + NVVM::ReduxKind kind = getKind(); + switch (kind) { + case NVVM::ReduxKind::ADD: + case NVVM::ReduxKind::AND: + case NVVM::ReduxKind::OR: + case NVVM::ReduxKind::XOR: + case NVVM::ReduxKind::MAX: + case NVVM::ReduxKind::MIN: + case NVVM::ReduxKind::UMAX: + case NVVM::ReduxKind::UMIN: + if (!reduxType.isInteger(32)) + return emitOpError("'") + << stringifyEnum(kind) << "' redux kind unsupported with " + << reduxType << " type. Only supported type is 'i32'."; + break; + case NVVM::ReduxKind::FMIN: + case NVVM::ReduxKind::FMAX: + if (!reduxType.isF32()) + return emitOpError("'") + << stringifyEnum(kind) << "' redux kind unsupported with " + << reduxType << " type. Only supported type is 'f32'."; + break; + } + + return success(); +} + /// Packs the given `field` into the `result`. /// The `result` is 64-bits and each `field` can be 32-bits or narrower. 
static llvm::Value * @@ -1637,15 +1752,21 @@ std::string NVVM::MBarrierInitOp::getPtx() { // getIntrinsicID/getIntrinsicIDAndArgs methods //===----------------------------------------------------------------------===// +static bool isPtrInAddrSpace(mlir::Value ptr, NVVMMemorySpace targetAS) { + auto ptrTy = llvm::cast<LLVM::LLVMPointerType>(ptr.getType()); + return ptrTy.getAddressSpace() == static_cast<unsigned>(targetAS); +} + +static bool isPtrInSharedCTASpace(mlir::Value ptr) { + return isPtrInAddrSpace(ptr, NVVMMemorySpace::Shared); +} + mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs( Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { auto thisOp = cast<NVVM::MBarrierInitOp>(op); - unsigned addressSpace = - llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType()) - .getAddressSpace(); - llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared) - ? llvm::Intrinsic::nvvm_mbarrier_init_shared - : llvm::Intrinsic::nvvm_mbarrier_init; + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = isShared ? llvm::Intrinsic::nvvm_mbarrier_init_shared + : llvm::Intrinsic::nvvm_mbarrier_init; // Fill the Intrinsic Args llvm::SmallVector<llvm::Value *> args; @@ -1658,16 +1779,72 @@ mlir::NVVM::IDArgPair MBarrierInitOp::getIntrinsicIDAndArgs( mlir::NVVM::IDArgPair MBarrierInvalOp::getIntrinsicIDAndArgs( Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { auto thisOp = cast<NVVM::MBarrierInvalOp>(op); - unsigned addressSpace = - llvm::cast<LLVM::LLVMPointerType>(thisOp.getAddr().getType()) - .getAddressSpace(); - llvm::Intrinsic::ID id = (addressSpace == NVVMMemorySpace::Shared) + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = isShared ? llvm::Intrinsic::nvvm_mbarrier_inval_shared : llvm::Intrinsic::nvvm_mbarrier_inval; return {id, {mt.lookupValue(thisOp.getAddr())}}; } +mlir::NVVM::IDArgPair MBarrierArriveOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::MBarrierArriveOp>(op); + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = isShared + ? llvm::Intrinsic::nvvm_mbarrier_arrive_shared + : llvm::Intrinsic::nvvm_mbarrier_arrive; + + return {id, {mt.lookupValue(thisOp.getAddr())}}; +} + +mlir::NVVM::IDArgPair MBarrierArriveNocompleteOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::MBarrierArriveNocompleteOp>(op); + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = + isShared ? llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete_shared + : llvm::Intrinsic::nvvm_mbarrier_arrive_noComplete; + // Fill the Intrinsic Args + llvm::SmallVector<llvm::Value *> args; + args.push_back(mt.lookupValue(thisOp.getAddr())); + args.push_back(mt.lookupValue(thisOp.getCount())); + + return {id, std::move(args)}; +} + +mlir::NVVM::IDArgPair MBarrierTestWaitOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::MBarrierTestWaitOp>(op); + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + llvm::Intrinsic::ID id = isShared + ? 
llvm::Intrinsic::nvvm_mbarrier_test_wait_shared + : llvm::Intrinsic::nvvm_mbarrier_test_wait; + // Fill the Intrinsic Args + llvm::SmallVector<llvm::Value *> args; + args.push_back(mt.lookupValue(thisOp.getAddr())); + args.push_back(mt.lookupValue(thisOp.getState())); + + return {id, std::move(args)}; +} + +mlir::NVVM::IDArgPair CpAsyncMBarrierArriveOp::getIntrinsicIDAndArgs( + Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) { + auto thisOp = cast<NVVM::CpAsyncMBarrierArriveOp>(op); + bool isShared = isPtrInSharedCTASpace(thisOp.getAddr()); + + llvm::Intrinsic::ID id; + if (thisOp.getNoinc()) { + id = isShared ? llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc_shared + : llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_noinc; + } else { + id = isShared ? llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive_shared + : llvm::Intrinsic::nvvm_cp_async_mbarrier_arrive; + } + + return {id, {mt.lookupValue(thisOp.getAddr())}}; +} + #define CP_ASYNC_ID_IMPL(mod, size, suffix) \ llvm::Intrinsic::nvvm_cp_async_##mod##_shared_global_##size##suffix @@ -2469,6 +2646,85 @@ Tcgen05CommitOp::getIntrinsicIDAndArgs(Operation &op, return TCGEN05_CP_2CTA(shape_mc, , is_2cta); \ }() +llvm::Intrinsic::ID ConvertF32x2ToF16x2Op::getIntrinsicID() { + bool hasRelu = getRelu(); + bool hasSatFinite = (getSat() == NVVM::SaturationMode::SATFINITE); + + if (hasRelu && hasSatFinite) + return llvm::Intrinsic::nvvm_ff2f16x2_rs_relu_satfinite; + if (hasRelu) + return llvm::Intrinsic::nvvm_ff2f16x2_rs_relu; + if (hasSatFinite) + return llvm::Intrinsic::nvvm_ff2f16x2_rs_satfinite; + return llvm::Intrinsic::nvvm_ff2f16x2_rs; +} + +llvm::Intrinsic::ID ConvertF32x2ToBF16x2Op::getIntrinsicID() { + bool hasRelu = getRelu(); + bool hasSatFinite = (getSat() == NVVM::SaturationMode::SATFINITE); + + if (hasRelu && hasSatFinite) + return llvm::Intrinsic::nvvm_ff2bf16x2_rs_relu_satfinite; + if (hasRelu) + return llvm::Intrinsic::nvvm_ff2bf16x2_rs_relu; + if (hasSatFinite) + return llvm::Intrinsic::nvvm_ff2bf16x2_rs_satfinite; + return llvm::Intrinsic::nvvm_ff2bf16x2_rs; +} + +llvm::Intrinsic::ID ConvertF32x4ToF8x4Op::getIntrinsicID() { + mlir::Type dstTy = getDstTy(); + bool hasRelu = getRelu(); + + return llvm::TypeSwitch<mlir::Type, llvm::Intrinsic::ID>(dstTy) + .Case<mlir::Float8E4M3FNType>([&](mlir::Float8E4M3FNType) { + return hasRelu ? llvm::Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite; + }) + .Case<mlir::Float8E5M2Type>([&](mlir::Float8E5M2Type) { + return hasRelu ? llvm::Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite; + }) + .Default([](mlir::Type) { + llvm_unreachable("Invalid F8 type in ConvertF32x4ToF8x4Op"); + return llvm::Intrinsic::not_intrinsic; + }); +} + +llvm::Intrinsic::ID ConvertF32x4ToF6x4Op::getIntrinsicID() { + mlir::Type dstTy = getDstTy(); + bool hasRelu = getRelu(); + + return llvm::TypeSwitch<mlir::Type, llvm::Intrinsic::ID>(dstTy) + .Case<mlir::Float6E2M3FNType>([&](mlir::Float6E2M3FNType) { + return hasRelu ? llvm::Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite; + }) + .Case<mlir::Float6E3M2FNType>([&](mlir::Float6E3M2FNType) { + return hasRelu ? 
llvm::Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite; + }) + .Default([](mlir::Type) { + llvm_unreachable("Invalid F6 type in ConvertF32x4ToF6x4Op"); + return llvm::Intrinsic::not_intrinsic; + }); +} + +llvm::Intrinsic::ID ConvertF32x4ToF4x4Op::getIntrinsicID() { + mlir::Type dstTy = getDstTy(); + bool hasRelu = getRelu(); + + return llvm::TypeSwitch<mlir::Type, llvm::Intrinsic::ID>(dstTy) + .Case<mlir::Float4E2M1FNType>([&](mlir::Float4E2M1FNType) { + return hasRelu ? llvm::Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite + : llvm::Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite; + }) + .Default([](mlir::Type) { + llvm_unreachable("Invalid F4 type in ConvertF32x4ToF4x4Op"); + return llvm::Intrinsic::not_intrinsic; + }); +} + llvm::Intrinsic::ID Tcgen05CpOp::getIntrinsicID(Operation &op) { auto curOp = cast<NVVM::Tcgen05CpOp>(op); bool is2CTA = curOp.getGroup() == CTAGroupKind::CTA_2; @@ -2508,6 +2764,9 @@ LogicalResult Tcgen05LdOp::verify() { if (getShape() == NVVM::Tcgen05LdStShape::SHAPE_16X32BX2 && !getOffset()) result = emitError("shape 16x32bx2 requires offset argument"); + if (getShape() != NVVM::Tcgen05LdStShape::SHAPE_16X32BX2 && getOffset()) + result = emitError("offset argument is only supported for shape 16x32bx2"); + auto resTy = getRes().getType(); unsigned resLen = isa<VectorType>(resTy) ? llvm::cast<VectorType>(resTy).getNumElements() diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 35eba724a9059..b2f1d840f3bca 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -3068,8 +3068,12 @@ LogicalResult acc::LoopOp::verify() { if (getRegion().empty()) return emitError("expected non-empty body."); - // When it is container-like - it is expected to hold a loop-like operation. - if (isContainerLike()) { + if (getUnstructured()) { + if (!isContainerLike()) + return emitError( + "unstructured acc.loop must not have induction variables"); + } else if (isContainerLike()) { + // When it is container-like - it is expected to hold a loop-like operation. // Obtain the maximum collapse count - we use this to check that there // are enough loops contained. uint64_t collapseCount = getCollapseValue().value_or(1); diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index 2946b53c8cb36..881e256a8797b 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -2565,6 +2565,39 @@ struct ConvertTrivialIfToSelect : public OpRewritePattern<IfOp> { struct ConditionPropagation : public OpRewritePattern<IfOp> { using OpRewritePattern<IfOp>::OpRewritePattern; + /// Kind of parent region in the ancestor cache. + enum class Parent { Then, Else, None }; + + /// Returns the kind of region ("then", "else", or "none") of the + /// IfOp that the given region is transitively nested in. Updates + /// the cache accordingly. 
+ static Parent getParentType(Region *toCheck, IfOp op, + DenseMap<Region *, Parent> &cache, + Region *endRegion) { + SmallVector<Region *> seen; + while (toCheck != endRegion) { + auto found = cache.find(toCheck); + if (found != cache.end()) + return found->second; + seen.push_back(toCheck); + if (&op.getThenRegion() == toCheck) { + for (Region *region : seen) + cache[region] = Parent::Then; + return Parent::Then; + } + if (&op.getElseRegion() == toCheck) { + for (Region *region : seen) + cache[region] = Parent::Else; + return Parent::Else; + } + toCheck = toCheck->getParentRegion(); + } + + for (Region *region : seen) + cache[region] = Parent::None; + return Parent::None; + } + LogicalResult matchAndRewrite(IfOp op, PatternRewriter &rewriter) const override { // Early exit if the condition is constant since replacing a constant @@ -2580,9 +2613,12 @@ struct ConditionPropagation : public OpRewritePattern<IfOp> { Value constantTrue = nullptr; Value constantFalse = nullptr; + DenseMap<Region *, Parent> cache; for (OpOperand &use : llvm::make_early_inc_range(op.getCondition().getUses())) { - if (op.getThenRegion().isAncestor(use.getOwner()->getParentRegion())) { + switch (getParentType(use.getOwner()->getParentRegion(), op, cache, + op.getCondition().getParentRegion())) { + case Parent::Then: { changed = true; if (!constantTrue) @@ -2591,8 +2627,9 @@ struct ConditionPropagation : public OpRewritePattern<IfOp> { rewriter.modifyOpInPlace(use.getOwner(), [&]() { use.set(constantTrue); }); - } else if (op.getElseRegion().isAncestor( - use.getOwner()->getParentRegion())) { + break; + } + case Parent::Else: { changed = true; if (!constantFalse) @@ -2601,6 +2638,10 @@ struct ConditionPropagation : public OpRewritePattern<IfOp> { rewriter.modifyOpInPlace(use.getOwner(), [&]() { use.set(constantFalse); }); + break; + } + case Parent::None: + break; } } diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index ae3423c40040d..daef0ba02100a 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -717,7 +717,15 @@ Value mlir::vector::getVectorReductionOp(arith::AtomicRMWKind op, case arith::AtomicRMWKind::ori: return vector::ReductionOp::create(builder, vector.getLoc(), CombiningKind::OR, vector); - // TODO: Add remaining reduction operations. + case arith::AtomicRMWKind::minnumf: + return vector::ReductionOp::create(builder, vector.getLoc(), + CombiningKind::MINNUMF, vector); + case arith::AtomicRMWKind::maxnumf: + return vector::ReductionOp::create(builder, vector.getLoc(), + CombiningKind::MAXNUMF, vector); + case arith::AtomicRMWKind::xori: + return vector::ReductionOp::create(builder, vector.getLoc(), + CombiningKind::XOR, vector); default: (void)emitOptionalError(loc, "Reduction operation type not supported"); break; diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 83406c8c75dcf..fb5d1e758dbd1 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -37,55 +37,61 @@ void XeGPUDialect::initialize() { >(); } -/// Generates instructions to compute offsets for a subgroup identified by -/// its multidimensional indices (sgId), using the specified subgroup layout -/// (sgLayout), subgroup data dimensions (sizePerSg), and the overall data -/// dimensions (sizePerWg). +// A `srcShape` consists of N distribution units, each being `subShapesLayout` x +// `subShape`. 
A `delinearizedId` is used to identify a particular `subShape` +// within each distribution unit. +// Example: +// WG data is 128x256. SG data is 16x32, in 4x2 layout, this gives a +// distribution unit of shape 64x64, we have 2x4 such distribution units. +// `delinearizedId` is used to identify a 16x32 of a subgroup in each +// distribution unit. static SmallVector<SmallVector<Value>> -genOffsetsComputingInsts(OpBuilder &builder, Location loc, - SmallVector<Value> sgId, ArrayRef<int64_t> sgLayout, - ArrayRef<int64_t> sizePerSg, - ArrayRef<int64_t> sizePerWg) { - - SmallVector<SmallVector<Value>> offsets; +genCoordinates(OpBuilder &builder, Location loc, + SmallVector<Value> delinearizedId, + ArrayRef<int64_t> subShapesLayout, ArrayRef<int64_t> subShape, + ArrayRef<int64_t> srcShape) { + SmallVector<SmallVector<Value>> coordinates; + + // A distribution unit must be less than or equal to `srcShape` + SmallVector<int64_t> distUnitShape = llvm::map_to_vector( + llvm::zip_equal(srcShape, + computeElementwiseMul(subShapesLayout, subShape)), + [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); }); - // nd local offset, localOffset[i] = sgId[i] * sizePerSg[i] - SmallVector<Value> localOffsets = llvm::map_to_vector( - llvm::zip(sgId, sizePerSg), [&](const auto &t) -> Value { + // Get the offset of `subShape` within a distribution unit. + SmallVector<Value> distUnitLocalOffset = llvm::map_to_vector( + llvm::zip(delinearizedId, subShape), [&](const auto &t) -> Value { return builder.createOrFold<index::MulOp>( loc, std::get<0>(t), builder.createOrFold<arith::ConstantIndexOp>(loc, std::get<1>(t))); }); - // distUnit[i] is the minimum value between sizePerWg[i] and - // sgLayout[i] * sizePerSg[i] - SmallVector<int64_t> distUnit = llvm::map_to_vector( - llvm::zip_equal(sizePerWg, computeElementwiseMul(sgLayout, sizePerSg)), - [](const auto &t) { return std::min(std::get<0>(t), std::get<1>(t)); }); - + // For each dist unit for (SmallVector<int64_t> unitOffs : - StaticTileOffsetRange(sizePerWg, distUnit)) { + StaticTileOffsetRange(srcShape, distUnitShape)) { + // Get dist unit offset within `srcShape`. SmallVector<Value> base = llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value { return arith::ConstantIndexOp::create(builder, loc, d); }); - - SmallVector<Value> adds = llvm::map_to_vector( - llvm::zip_equal(base, localOffsets), [&](const auto &t) -> Value { - return builder.createOrFold<arith::AddIOp>(loc, std::get<0>(t), - std::get<1>(t)); - }); - + // Calculate `subShape` offset within `srcShape`. + SmallVector<Value> adds = + llvm::map_to_vector(llvm::zip_equal(base, distUnitLocalOffset), + [&](const auto &t) -> Value { + return builder.createOrFold<arith::AddIOp>( + loc, std::get<0>(t), std::get<1>(t)); + }); + // Do not go beyond `srcShape` bounds. 
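// Editor's illustration (standalone, not part of the patch): the coordinate
// arithmetic above, worked with the numbers from the comment. WG data is
// 128x256, SG data is 16x32 in a 4x2 layout, so a distribution unit is 64x64
// and there are 2x4 such units. Subgroup id (1, 1) owns one 16x32 tile at
// local offset (16, 32) inside every distribution unit.
#include <algorithm>
#include <cstdio>

int main() {
  const long srcShape[2] = {128, 256}; // workgroup data
  const long subShape[2] = {16, 32};   // per-subgroup tile
  const long layout[2] = {4, 2};       // subgroup layout
  const long id[2] = {1, 1};           // delinearized subgroup id

  long distUnit[2], local[2];
  for (int i = 0; i < 2; ++i) {
    distUnit[i] = std::min(srcShape[i], layout[i] * subShape[i]); // 64, 64
    local[i] = id[i] * subShape[i];                               // 16, 32
  }
  // One coordinate per distribution unit, wrapped back into srcShape.
  for (long r = 0; r < srcShape[0]; r += distUnit[0])
    for (long c = 0; c < srcShape[1]; c += distUnit[1])
      std::printf("(%ld, %ld)\n", (r + local[0]) % srcShape[0],
                  (c + local[1]) % srcShape[1]);
  // Prints (16,32), (16,96), (16,160), (16,224), (80,32), (80,96), ...
  return 0;
}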
SmallVector<Value> mods = llvm::map_to_vector( - llvm::zip_equal(adds, sizePerWg), [&](const auto &t) -> Value { + llvm::zip_equal(adds, srcShape), [&](const auto &t) -> Value { return builder.createOrFold<index::RemUOp>( loc, std::get<0>(t), arith::ConstantIndexOp::create(builder, loc, std::get<1>(t))); }); - offsets.push_back(mods); + coordinates.push_back(mods); } - return offsets; + return coordinates; } // Checks if the given shape can be evenly distributed based on the layout @@ -272,56 +278,117 @@ LayoutAttr::verify(llvm::function_ref<mlir::InFlightDiagnostic()> emitError, } FailureOr<SmallVector<Value>> -LayoutAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc, - Value linearId) { - // delinearizeSubgroupId is only available for - // workgroup-level layout attribute - if (!isForWorkgroup()) +LayoutAttr::delinearizeId(OpBuilder &builder, Location loc, Value linearId) { + + SmallVector<int64_t> sgLayoutInt; + if (isForWorkgroup()) { + sgLayoutInt = getEffectiveSgLayoutAsInt(); + } else if (isForSubgroup()) { + sgLayoutInt = getEffectiveLaneLayoutAsInt(); + } else { return failure(); + } - // TODO: handle order attribute - auto hasDefaultOrder = [&]() { - DenseI32ArrayAttr order = getOrder(); - return !order || isIdentityPermutation(llvm::to_vector_of<int64_t>( - llvm::reverse(order.asArrayRef()))); - }; - if (!hasDefaultOrder()) - return mlir::emitError(loc, "order attribute is currently not supported."); + DenseI32ArrayAttr orderAttr = getOrder(); - auto dims = - llvm::map_to_vector(getEffectiveSgLayoutAsInt(), [&](int64_t d) -> Value { - return builder.createOrFold<arith::ConstantIndexOp>(loc, d); - }); + // Handle order attribute + SmallVector<int64_t> order; + if (orderAttr && !orderAttr.empty()) { + order = llvm::to_vector( + llvm::map_range(orderAttr.asArrayRef(), + [](int32_t idx) { return static_cast<int64_t>(idx); })); + } else { + // Default order: [1, 0] for 2D (row-major), [2, 1, 0] for 3D, etc. + order = llvm::to_vector( + llvm::reverse(llvm::seq<int64_t>(0, sgLayoutInt.size()))); + } - return affine::delinearizeIndex(builder, loc, linearId, dims); + if (order.size() != sgLayoutInt.size()) { + return failure(); + } + + SmallVector<Value> result(sgLayoutInt.size()); + Value remaining = linearId; + + /// Process dimensions in the order they appear in the order array + /// The first dimension in order is the fastest-changing + /// + /// Example walkthrough for linearId=22, sgLayout=[2,4,4], order=[2,1,0]: + /// + /// Initial: remaining=22, dimIdx = order[i], dimSize = sgLayout[dimIdx], + /// result=[?,?,?] 
+ /// + /// i=0 (process columns, dimIdx=2, dimSize=4): + /// result[2] = 22 % 4 = 2 (column coordinate) + /// remaining = 22 / 4 = 5 (5 complete groups of 4 columns processed) + /// + /// i=1 (process rows, dimIdx=1, dimSize=4): + /// result[1] = 5 % 4 = 1 (row coordinate) + /// remaining = 5 / 4 = 1 (1 complete group of 4 rows processed) + /// + /// i=2 (process layers, dimIdx=0, dimSize=2): + /// result[0] = 1 % 2 = 1 (layer coordinate) + /// (no remaining update - last iteration) + /// + /// Final result: [1,1,2] = Layer 1, Row 1, Column 2 + for (size_t i = 0; i < order.size(); ++i) { + int64_t dimIdx = order[i]; + int64_t dimSize = sgLayoutInt[dimIdx]; + + Value dimSizeVal = + builder.createOrFold<arith::ConstantIndexOp>(loc, dimSize); + + /// Extract the coordinate for this dimension using modulo operation + /// This gives us "how far within this dimension" we are + /// e.g., linearId=22, dimSize=4: 22 % 4 = 2 (we're at position 2 within + /// this dimension) + result[dimIdx] = + builder.createOrFold<index::RemUOp>(loc, remaining, dimSizeVal); + + /// Update remaining for the next dimension by removing what we've already + /// processed. Division tells us "how many complete groups of this dimension + /// we've gone through" e.g., linearId=22, dimSize=4: 22 / 4 = 5 (we've + /// completed 5 groups of 4) Skip this for the last iteration since there's + /// no next dimension to process + if (i < order.size() - 1) { + remaining = + builder.createOrFold<index::DivUOp>(loc, remaining, dimSizeVal); + } + } + return result; } -/// Implements DistributeLayoutAttr::getOffsets to generate +/// Implements DistributeLayoutAttr::computeDistributedCoords to generate /// instructions for computing multi-dimensional offsets when distributed by /// LayoutAttr. FailureOr<SmallVector<SmallVector<Value>>> -LayoutAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId, - ArrayRef<int64_t> shape) { - if (!isForWorkgroup()) +LayoutAttr::computeDistributedCoords(OpBuilder &builder, Location loc, + Value linearId, ArrayRef<int64_t> shape) { + SmallVector<int64_t> layout; + SmallVector<int64_t> subShape; + if (isForWorkgroup()) { + layout = getEffectiveSgLayoutAsInt(); + subShape = getEffectiveSgDataAsInt(); + } else if (isForSubgroup()) { + layout = getEffectiveLaneLayoutAsInt(); + subShape = getEffectiveLaneDataAsInt(); + } else { return failure(); - - SmallVector<int64_t> sgLayout = getEffectiveSgLayoutAsInt(); - SmallVector<int64_t> sgShape = getEffectiveSgDataAsInt(); - if (sgShape.empty()) { - if (auto derivedShape = computeShapeRatio(shape, sgLayout)) - sgShape = derivedShape.value(); + } + if (subShape.empty()) { + if (auto derivedShape = computeShapeRatio(shape, layout)) + subShape = derivedShape.value(); else return failure(); } // delinearize Ids - auto maybeIds = delinearizeSubgroupId(builder, loc, linearId); + auto maybeIds = delinearizeId(builder, loc, linearId); if (failed(maybeIds)) return failure(); - SmallVector<Value> sgIds = *maybeIds; + SmallVector<Value> ids = *maybeIds; - return genOffsetsComputingInsts(builder, loc, sgIds, sgLayout, sgShape, - shape); + return genCoordinates(builder, loc, ids, layout, subShape, shape); } //===----------------------------------------------------------------------===// @@ -375,34 +442,43 @@ SliceAttr SliceAttr::flatten() const { } FailureOr<SmallVector<Value>> -SliceAttr::delinearizeSubgroupId(OpBuilder &builder, Location loc, - Value linearId) { +SliceAttr::delinearizeId(OpBuilder &builder, Location loc, Value linearId) { SliceAttr attr = 
flatten(); auto parent = dyn_cast<LayoutAttr>(attr.getParent()); - return parent.delinearizeSubgroupId(builder, loc, linearId); + return parent.delinearizeId(builder, loc, linearId); } -/// Implements DistributeLayoutAttr::getOffsets to generate -/// instructions for computing multi-dimensional offsets when distributed by -/// SliceAttr. +// Implements DistributeLayoutAttr::computeDistributedCoords to generate +// instructions for computing multi-dimensional offsets when distributed by +// LayoutAttr. FailureOr<SmallVector<SmallVector<Value>>> -SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId, - ArrayRef<int64_t> shape) { +SliceAttr::computeDistributedCoords(OpBuilder &builder, Location loc, + Value linearId, ArrayRef<int64_t> shape) { assert(getRank() == static_cast<int64_t>(shape.size()) && "invalid shape."); if (!isForWorkgroup()) return failure(); - SmallVector<int64_t> sgLayout = getEffectiveSgLayoutAsInt(); - SmallVector<int64_t> sgShape = getEffectiveSgDataAsInt(); - if (sgShape.empty()) { - if (auto derivedShape = computeShapeRatio(shape, sgLayout)) - sgShape = derivedShape.value(); + SmallVector<int64_t> layout; + SmallVector<int64_t> subShape; + if (isForWorkgroup()) { + layout = getEffectiveSgLayoutAsInt(); + subShape = getEffectiveSgDataAsInt(); + } else if (isForSubgroup()) { + layout = getEffectiveLaneLayoutAsInt(); + subShape = getEffectiveLaneDataAsInt(); + } else { + return failure(); + } + + if (subShape.empty()) { + if (auto derivedShape = computeShapeRatio(shape, layout)) + subShape = derivedShape.value(); else return failure(); } // delinearize Ids - auto maybeIds = delinearizeSubgroupId(builder, loc, linearId); + auto maybeIds = delinearizeId(builder, loc, linearId); if (failed(maybeIds)) return failure(); @@ -412,8 +488,7 @@ SliceAttr::getOffsets(OpBuilder &builder, Location loc, Value linearId, SmallVector<Value> sgIds = XeGPUDialect::slice(ArrayRef<Value>(*maybeIds), dims); - return genOffsetsComputingInsts(builder, loc, sgIds, sgLayout, sgShape, - shape); + return genCoordinates(builder, loc, sgIds, layout, subShape, shape); } bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) { diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index abd12e2e69ac0..4dd10bedc6d84 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -175,13 +175,13 @@ isValidGatherScatterBufferParams(Type offsetsTy, Type maskTy, LogicalResult IsValidMatrixOpParams(VectorType dataTy, MemDescType mdescTy, - UnitAttr subgroup_block_io, + UnitAttr subgroup_block_io, DistributeLayoutAttr layout, function_ref<InFlightDiagnostic()> emitError) { if (!dataTy) { if (subgroup_block_io) return emitError() << "subgroup_block_io " - "are only allowed when result is a 1D VectorType."; + "are only allowed when result is a VectorType."; else return success(); } @@ -192,15 +192,37 @@ IsValidMatrixOpParams(VectorType dataTy, MemDescType mdescTy, ArrayRef<int64_t> dataShape = dataTy.getShape(); ArrayRef<int64_t> mdescShape = mdescTy.getShape(); + SmallVector<int64_t> blockShape = mdescTy.getBlockShape(); + ArrayAttr strideAttr = mdescTy.getStrideAttr(); + SmallVector<int64_t> strides; + for (Attribute attr : strideAttr.getValue()) { + strides.push_back(cast<IntegerAttr>(attr).getInt()); + } + if (subgroup_block_io && layout) { + auto laneData = layout.getEffectiveLaneDataAsInt(); + auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); + if (!laneData.empty()) { + bool isLaneDataContiguous = 
+ std::all_of(laneData.begin(), std::prev(laneData.end()), + [](int x) { return x == 1; }); + if (!isLaneDataContiguous) + return emitError() << "With subgroup_block_io, accessed data must be " + "contiguous and coalesced."; + for (size_t i = 0; i < laneData.size(); ++i) { + if (laneLayout[i] != blockShape[i]) + return emitError() << "With subgroup_block_io, the block shape must " + "match the lane layout."; + if (laneLayout[i] != 1 && strides[i] != 1) + return emitError() << "With subgroup_block_io, the distributed " + "dimensions must be contiguous."; + } + } + } if (dataShape.size() == 2) { - if (subgroup_block_io) - return emitError() << "subgroup_block_io " - "are only allowed when result is a 1D VectorType."; if (llvm::any_of(llvm::zip_equal(dataShape, mdescShape), [](auto p) { return std::get<0>(p) > std::get<1>(p); })) return emitError() << "data shape must not exceed mem_desc shape."; } else { - SmallVector<int64_t> blockShape = mdescTy.getBlockShape(); // if the subgroup_block_io attribute is set, mdescTy must have block // attribute if (subgroup_block_io && !blockShape.size()) @@ -258,8 +280,10 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, auto [memrefStrides, _] = memrefTy.getStridesAndOffset(); // if shape and strides are from Memref, we don't need attributes for them - // to keep the IR print clean. - if (staticShape == memrefShape && staticStrides == memrefStrides) { + // to keep the IR print clean (only do so for full-static case, otherwise + // printer would fail trying to print empty array-attr). + if (staticShape == memrefShape && staticStrides == memrefStrides && + dynamicShape.empty() && dynamicStrides.empty()) { staticShapeAttr = DenseI64ArrayAttr(); staticStridesAttr = DenseI64ArrayAttr(); } @@ -320,8 +344,10 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, auto [memrefStrides, _] = memrefTy.getStridesAndOffset(); // if shape and strides are from Memref, we don't need attributes for them - // to keep the IR print clean. - if (staticShape == memrefShape && staticStrides == memrefStrides) { + // to keep the IR print clean (only do so for full-static case, otherwise + // printer would fail trying to print empty array-attr). + if (staticShape == memrefShape && staticStrides == memrefStrides && + dynamicShape.empty() && dynamicStrides.empty()) { staticShapeAttr = DenseI64ArrayAttr(); staticStridesAttr = DenseI64ArrayAttr(); } @@ -472,11 +498,8 @@ LogicalResult PrefetchNdOp::verify() { return emitOpError("invalid l3_hint: ") << getL3HintAttr(); int64_t tDescRank = tdescTy.getRank(); - int64_t offsetSize = static_cast<int64_t>(getOffsets().size()); - int64_t constOffsetSize = - getConstOffsetsAttr() ? getConstOffsetsAttr().size() : 0; - if (((offsetSize != 0) && (offsetSize != tDescRank)) || - ((constOffsetSize != 0) && (constOffsetSize != tDescRank))) + int64_t offsetSize = getMixedOffsets().size(); + if (offsetSize != 0 && offsetSize != tDescRank) return emitOpError( "Mismatched ranks between offsets and tensor descriptor"); @@ -597,11 +620,8 @@ LogicalResult LoadNdOp::verify() { << tdescTy; int64_t tDescRank = tdescTy.getRank(); - int64_t offsetSize = static_cast<int64_t>(getOffsets().size()); - int64_t constOffsetSize = - getConstOffsetsAttr() ? 
getConstOffsetsAttr().size() : 0; - if (((offsetSize != 0) && (offsetSize != tDescRank)) || - ((constOffsetSize != 0) && (constOffsetSize != tDescRank))) + int64_t offsetSize = getMixedOffsets().size(); + if (offsetSize != 0 && offsetSize != tDescRank) return emitOpError( "Mismatched ranks between offsets and tensor descriptor"); @@ -691,11 +711,8 @@ LogicalResult StoreNdOp::verify() { << dstTy; int64_t tDescRank = dstTy.getRank(); - int64_t offsetSize = static_cast<int64_t>(getOffsets().size()); - int64_t constOffsetSize = - getConstOffsetsAttr() ? getConstOffsetsAttr().size() : 0; - if (((offsetSize != 0) && (offsetSize != tDescRank)) || - ((constOffsetSize != 0) && (constOffsetSize != tDescRank))) + int64_t offsetSize = getMixedOffsets().size(); + if (offsetSize != 0 && offsetSize != tDescRank) return emitOpError( "Mismatched ranks between offsets and tensor descriptor"); @@ -859,7 +876,7 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state, xegpu::CachePolicyAttr l2_hint, xegpu::CachePolicyAttr l3_hint) { build(builder, state, valueType, source, Value(), mask, IntegerAttr(), - l1_hint, l2_hint, l3_hint); + l1_hint, l2_hint, l3_hint, /*layout=*/nullptr); } void LoadGatherOp::build(OpBuilder &builder, OperationState &state, @@ -875,7 +892,24 @@ void LoadGatherOp::build(OpBuilder &builder, OperationState &state, auto offset = vector::FromElementsOp::create(builder, loc, type, values); build(builder, state, valueType, source, offset, mask, chunk_size, l1_hint, - l2_hint, l3_hint); + l2_hint, l3_hint, /*layout=*/nullptr); +} + +void LoadGatherOp::build(OpBuilder &builder, OperationState &state, + Type valueType, Value source, + ArrayRef<OpFoldResult> offsets, Value mask, + IntegerAttr chunk_size, xegpu::CachePolicyAttr l1_hint, + xegpu::CachePolicyAttr l2_hint, + xegpu::CachePolicyAttr l3_hint, + xegpu::LayoutAttr layout) { + auto loc = source.getLoc(); + int64_t size = static_cast<int64_t>(offsets.size()); + auto type = VectorType::get(size, builder.getIndexType()); + auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets); + auto offset = vector::FromElementsOp::create(builder, loc, type, values); + + build(builder, state, valueType, source, offset, mask, chunk_size, l1_hint, + l2_hint, l3_hint, layout); } //===----------------------------------------------------------------------===// @@ -926,7 +960,7 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state, xegpu::CachePolicyAttr l2_hint, xegpu::CachePolicyAttr l3_hint) { build(builder, state, value, dest, Value(), mask, IntegerAttr(), l1_hint, - l2_hint, l3_hint); + l2_hint, l3_hint, /*layout=*/nullptr); } void StoreScatterOp::build(OpBuilder &builder, OperationState &state, @@ -944,7 +978,23 @@ void StoreScatterOp::build(OpBuilder &builder, OperationState &state, // Call the correct builder overload that does not expect result types. 
build(builder, state, value, dest, offset, mask, chunk_size, l1_hint, l2_hint, - l3_hint); + l3_hint, /*layout=*/nullptr); +} + +void StoreScatterOp::build( + OpBuilder &builder, OperationState &state, Value value, Value dest, + ArrayRef<OpFoldResult> offsets, Value mask, IntegerAttr chunk_size, + xegpu::CachePolicyAttr l1_hint, xegpu::CachePolicyAttr l2_hint, + xegpu::CachePolicyAttr l3_hint, xegpu::LayoutAttr layout) { + auto loc = dest.getLoc(); + int64_t size = static_cast<int64_t>(offsets.size()); + auto type = VectorType::get(size, builder.getIndexType()); + auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets); + auto offset = vector::FromElementsOp::create(builder, loc, type, values); + + // Call the correct builder overload that does not expect result types. + build(builder, state, value, dest, offset, mask, chunk_size, l1_hint, l2_hint, + l3_hint, layout); } //===----------------------------------------------------------------------===// @@ -1105,7 +1155,7 @@ LogicalResult LoadMatrixOp::verify() { MemDescType mdescTy = getMemDesc().getType(); return IsValidMatrixOpParams(resTy, mdescTy, subgroup_block_io, - [&]() { return emitError(); }); + getLayoutAttr(), [&]() { return emitError(); }); } //===----------------------------------------------------------------------===// @@ -1129,7 +1179,7 @@ LogicalResult StoreMatrixOp::verify() { UnitAttr subgroup_block_io = getSubgroupBlockIoAttr(); MemDescType mdescTy = getMemDesc().getType(); return IsValidMatrixOpParams(dataTy, mdescTy, subgroup_block_io, - [&]() { return emitError(); }); + getLayoutAttr(), [&]() { return emitError(); }); } namespace mlir { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt index e6f76067094ce..29b645feab2c6 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt @@ -6,6 +6,7 @@ add_mlir_dialect_library(MLIRXeGPUTransforms XeGPUWgToSgDistribute.cpp XeGPUPropagateLayout.cpp XeGPUVectorLinearize.cpp + XeGPUOptimizeBlockLoads.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp new file mode 100644 index 0000000000000..4dc5ea4f7bb24 --- /dev/null +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUOptimizeBlockLoads.cpp @@ -0,0 +1,490 @@ +//===- XeGPUOptimizeBlockLoads.cpp - XeGPU optimize block loads -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/SCF/Transforms/Patterns.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" +#include "mlir/Dialect/Utils/StaticValueUtils.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Dialect/XeGPU/IR/XeGPU.h" +#include "mlir/Dialect/XeGPU/Transforms/Passes.h" +#include "mlir/Dialect/XeGPU/Transforms/Transforms.h" +#include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" +#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" +#include "mlir/Dialect/XeGPU/uArch/uArchBase.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/Types.h" +#include "mlir/IR/Value.h" +#include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include <optional> + +namespace mlir { +namespace xegpu { +#define GEN_PASS_DEF_XEGPUOPTIMIZEBLOCKLOADS +#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc" +} // namespace xegpu +} // namespace mlir + +#define DEBUG_TYPE "xegpu-optimize-block-loads" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") + +using namespace mlir; + +namespace { + +/// Get the 2D lane data from a tensor desc type if it exists. +static std::optional<SmallVector<int64_t>> +getMaybeLaneData(xegpu::TensorDescType tdescType) { + auto layout = tdescType.getLayoutAttr(); + if (!layout) + return std::nullopt; + auto laneData = layout.getEffectiveLaneDataAsInt(); + if (laneData.size() != 2) + return std::nullopt; + return laneData; +} + +/// Get the 2D lane layout from a tensor desc type if it exists. +static std::optional<SmallVector<int64_t>> +getMaybeLaneLayout(xegpu::TensorDescType tdescType) { + auto layout = tdescType.getLayoutAttr(); + if (!layout) + return std::nullopt; + auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); + if (laneLayout.size() != 2) + return std::nullopt; + return laneLayout; +} + +/// A layout can be optimized if its lane layout is transposed (lane[0] != 1 && +/// lane[1] == 1), but inner lane data is not equal to [1, 1]. +/// Example: +/// !xegpu.tensor_desc<16x16xf16, +/// #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>> +/// In this case, lane layout is transposed (from the usual [1, SG_SIZE] form) +/// indicating that this is a load that requires transpose effect. However, +/// lane data is [1, 2], meaning that each lane must grab 2 f16 elements from +/// the inner dimension. We convert this to a optimized form by converting the +/// tensor_desc to i32 type such that lane data becomes [1, 1]. This makes the +/// later lowering easily use the load with transpose instruction. +static bool canBeOptimizedForTranspose(ArrayRef<int64_t> laneLayout, + ArrayRef<int64_t> laneData) { + if (laneLayout.size() != 2 || laneData.size() != 2) + return false; + if (laneLayout[0] == 1 || laneLayout[1] != 1) + return false; + if (laneData[0] != 1 || laneData[1] == 1) + return false; + return true; +} + +/// A tensor desc type can be optimized if its element type is less than 32 bits +/// and its layout can be optimized. +static bool canBeOptimizedForTranspose(xegpu::TensorDescType tdescType) { + // If the dtype is greater or equal to 32 bits, layout must be valid. 
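// Editor's sketch (standalone, not from the patch): the layout predicate
// defined just above in plain C++, exercised with the example from the doc
// comment. lane_layout = [16, 1] with lane_data = [1, 2] qualifies; the
// usual [1, 16] / [1, 1] layout of a non-transposed load does not.
#include <cassert>
#include <vector>

static bool canOptimizeForTranspose(const std::vector<long> &laneLayout,
                                    const std::vector<long> &laneData) {
  if (laneLayout.size() != 2 || laneData.size() != 2)
    return false;
  if (laneLayout[0] == 1 || laneLayout[1] != 1) // layout must be transposed
    return false;
  if (laneData[0] != 1 || laneData[1] == 1) // inner lane data must be > 1
    return false;
  return true;
}

int main() {
  assert(canOptimizeForTranspose({16, 1}, {1, 2}));  // f16 transpose load
  assert(!canOptimizeForTranspose({1, 16}, {1, 1})); // plain row-major load
  assert(!canOptimizeForTranspose({16, 1}, {1, 1})); // already [1, 1] data
  return 0;
}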
+ int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); + if (elementTyBitwidth >= 32) + return false; + auto maybeLaneLayout = getMaybeLaneLayout(tdescType); + auto maybeLaneData = getMaybeLaneData(tdescType); + if (!maybeLaneData || !maybeLaneLayout) + return false; + return canBeOptimizedForTranspose(*maybeLaneLayout, *maybeLaneData); +} + +/// Check if a tensor desc type can be optimized for transpose, if so return the +/// new optimized tensor desc type with a valid transpose layout. +static xegpu::TensorDescType tryOptimize(xegpu::TensorDescType tdescType, + const uArch *targetuArch) { + if (!canBeOptimizedForTranspose(tdescType)) + return tdescType; + auto laneData = getMaybeLaneData(tdescType) + .value(); // Lane data must exist if we reach here. + int64_t innerLaneData = laneData[1]; + int elementTyBitwidth = tdescType.getElementType().getIntOrFloatBitWidth(); + // Required shape is total shape of the vector result that this tensor desc + // must eventually load after adjusting for the new bitwidth and array + // length. + SmallVector<int64_t> requiredShape(tdescType.getShape()); + requiredShape.back() = + requiredShape.back() * tdescType.getArrayLength() / innerLaneData; + int newBitWidth = elementTyBitwidth * innerLaneData; + Type newElemTy = IntegerType::get(tdescType.getContext(), newBitWidth); + // Supported shape is the max transpose shape that can be supported by + // hardware that is less than or equal to required shape. + auto *blockLoadTarget = dyn_cast<Subgroup2DBlockLoadInstruction>( + targetuArch->getInstruction(InstructionKind::Subgroup2DBlockLoad)); + auto maybeHWParams = blockLoadTarget->getBlockWidthHeightCount( + newElemTy, /** has transform */ false, /** has transpose */ true); + // If no HW params found, return the original type. + if (!maybeHWParams) + return tdescType; + auto [widths, heights, counts] = maybeHWParams.value(); + // TODO: Currently we expect array length to be 1 for transpose case. + if (counts.size() != 1 || counts[0] != 1) + return tdescType; + int arrayLen = counts[0]; + int supportedHeight = + xegpu::getLargestDivisor(static_cast<int>(requiredShape[0]), heights); + int supportedWidth = + xegpu::getLargestDivisor(static_cast<int>(requiredShape[1]), widths); + // If no supported height or width found, return the original type. + if (supportedHeight == -1 || supportedWidth == -1) + return tdescType; + + SmallVector<int64_t> supportedShape = {supportedHeight, supportedWidth}; + xegpu::LayoutAttr newLayout = xegpu::LayoutAttr::get( + tdescType.getContext(), + tdescType.getLayoutAttr().getLaneLayout().asArrayRef(), {1, 1}); + // Array length can not be larger than 1 for transpose case. + return xegpu::TensorDescType::get(supportedShape, newElemTy, arrayLen, + tdescType.getBoundaryCheck(), + tdescType.getMemorySpace(), newLayout); +} + +/// Helper to convert an OpFoldResult to Value. +static Value convertToValue(ConversionPatternRewriter &rewriter, Location loc, + OpFoldResult ofr) { + std::optional<int64_t> mayBeInt = getConstantIntValue(ofr); + if (mayBeInt) + return arith::ConstantIndexOp::create(rewriter, loc, *mayBeInt).getResult(); + return llvm::cast<Value>(ofr); +} + +/// Helper to divide a Value by a constant integer. +static Value divideByConstant(ConversionPatternRewriter &rewriter, Location loc, + Value val, int64_t constant) { + // If the constant is a power of 2, use right shift for division. 
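// Editor's note (standalone sketch, not part of the patch): for unsigned
// values, dividing by a power of two is equivalent to a logical right shift
// by log2 of the divisor, which is the fast path divideByConstant takes here.
#include <cassert>
#include <cstdint>
#include <initializer_list>

int main() {
  const uint64_t innerLaneData = 2; // e.g. two f16 elements packed per i32
  const uint64_t shift = 1;         // log2(2)
  for (uint64_t offset : {0ull, 6ull, 64ull, 255ull})
    assert(offset / innerLaneData == offset >> shift);
  return 0;
}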
+ if (llvm::isPowerOf2_64(constant)) { + int64_t shiftAmount = llvm::Log2_64(constant); + return arith::ShRUIOp::create( + rewriter, loc, val, + arith::ConstantIndexOp::create(rewriter, loc, shiftAmount) + .getResult()) + .getResult(); + } + auto constantOp = + arith::ConstantIndexOp::create(rewriter, loc, constant).getResult(); + return arith::DivUIOp::create(rewriter, loc, val, constantOp).getResult(); +} + +/// This function takes a larger register block `data` and generates multiple +/// smaller loads (size given by `newTensorDesc`) to fill in the `data` block +/// starting from `offsets`. +static Value generateLoads(ConversionPatternRewriter &rewriter, + TypedValue<VectorType> data, + SmallVector<OpFoldResult> offsets, + TypedValue<xegpu::TensorDescType> newTensorDesc, + xegpu::LoadNdOp origLoadOp) { + Location loc = data.getLoc(); + assert(offsets.size() >= 2 && "Expecting at least 2 offsets for 2D LoadNdOp"); + Value offsetDim0 = convertToValue(rewriter, loc, offsets[offsets.size() - 2]); + Value offsetDim1 = convertToValue(rewriter, loc, offsets[offsets.size() - 1]); + SmallVector<int64_t> supportedShape(newTensorDesc.getType().getShape()); + // Compute the ratio between original shape and supported shape. We need to + // generate loads in this ratio arrangement. + auto shapeRatio = computeShapeRatio(data.getType().getShape(), + supportedShape) + .value(); // `ratio` must be defined if we reach here. + for (int64_t h = 0; h < shapeRatio[0]; ++h) { + for (int64_t w = 0; w < shapeRatio[1]; ++w) { + int64_t localOffsetDim0 = h * supportedShape[0]; + int64_t localOffsetDim1 = w * supportedShape[1]; + Value loadOffsetX = arith::AddIOp::create( + rewriter, loc, offsetDim0, + arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim0) + .getResult()); + Value loadOffsetY = arith::AddIOp::create( + rewriter, loc, offsetDim1, + arith::ConstantIndexOp::create(rewriter, loc, localOffsetDim1) + .getResult()); + auto loadOp = xegpu::LoadNdOp::create( + rewriter, loc, + VectorType::get(supportedShape, data.getType().getElementType()), + newTensorDesc, ArrayRef<OpFoldResult>{loadOffsetX, loadOffsetY}, + origLoadOp.getPackedAttr(), origLoadOp.getTransposeAttr(), + origLoadOp.getL1HintAttr(), origLoadOp.getL2HintAttr(), + origLoadOp.getL3HintAttr()); + // Set the layout for the loadOp. + auto layoutAttr = newTensorDesc.getType().getLayoutAttr(); + xegpu::setDistributeLayoutAttr(loadOp->getOpResult(0), layoutAttr); + // Insert the loaded block into the right position in data. + auto insertOp = vector::InsertStridedSliceOp::create( + rewriter, loc, loadOp.getResult(), data, + ArrayRef<int64_t>{localOffsetDim0, localOffsetDim1}, + ArrayRef<int64_t>{1, 1}); + // InsertOp must have the same layout as newTensorDesc. + xegpu::setDistributeLayoutAttr(insertOp->getOpResult(0), layoutAttr); + data = insertOp.getResult(); + } + } + return data; +} + +/// Checks if a CreateNdDescOp can be optimized for transpose, if so creates a +/// new CreateNdDescOp with optimized tensor desc type. This involves extracting +/// the base pointer from the original memory source and adjusting the shape and +/// strides of the tensor desc to fit with the new optimized transpose layout. 
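// Editor's illustration (standalone, with hypothetical sizes): how the
// generateLoads helper above tiles one large register block into several
// hardware-sized loads. With a 32x16 block and an 8x16 supported shape the
// ratio is 4x1, so four loads are emitted at row offsets 0, 8, 16 and 24
// relative to the original load offsets.
#include <cstdio>

int main() {
  const long dataShape[2] = {32, 16}; // block the original load produced
  const long supported[2] = {8, 16};  // largest HW-supported tile
  const long base[2] = {64, 0};       // offsets of the original load
  const long ratio[2] = {dataShape[0] / supported[0],
                         dataShape[1] / supported[1]}; // {4, 1}
  for (long h = 0; h < ratio[0]; ++h)
    for (long w = 0; w < ratio[1]; ++w)
      std::printf("load at (%ld, %ld), insert at (%ld, %ld)\n",
                  base[0] + h * supported[0], base[1] + w * supported[1],
                  h * supported[0], w * supported[1]);
  return 0;
}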
+class XeGPUCreateNdDescOpPattern final + : public OpConversionPattern<xegpu::CreateNdDescOp> { +public: + using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern; + LogicalResult + matchAndRewrite(xegpu::CreateNdDescOp createNdOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto tdescTy = createNdOp.getType(); + // Get the target uArch info. + auto chipStr = xegpu::getChipStr(createNdOp); + // Check if the chip is supported. + assert( + chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg") && + "Expecting target chip to be pvc or bmg for transpose optimization."); + const uArch *targetuArch = xegpu::uArch::getUArch(chipStr.value()); + + auto convertType = tryOptimize(tdescTy, targetuArch); + if (convertType == tdescTy) + return failure(); + auto strides = createNdOp.getMixedStrides(); + auto maybeConstInnerStride = getConstantIntValue(strides.back()); + // Only row-major memrefs are expected for now. + if (!maybeConstInnerStride || *maybeConstInnerStride != 1) + return rewriter.notifyMatchFailure( + createNdOp, "Expecting row-major memref for transpose optimization."); + Value source = createNdOp.getSource(); + auto optionalLaneData = getMaybeLaneData(tdescTy); + assert(optionalLaneData && "Expected 2D lane data"); + auto laneData = optionalLaneData.value(); + int64_t innerLaneData = laneData[1]; + auto memrefType = dyn_cast<MemRefType>(source.getType()); + // Inner dimension of the shape must be adjusted based on innerLaneData. + SmallVector<OpFoldResult> modifiedShape(createNdOp.getMixedSizes()); + modifiedShape.back() = divideByConstant( + rewriter, createNdOp.getLoc(), + convertToValue(rewriter, createNdOp.getLoc(), modifiedShape.back()), + innerLaneData); + // Similarly, second to last stride must be adjusted. + assert(strides.size() >= 2 && + "Expected at least 2 strides for CreateNdDescOp"); + SmallVector<OpFoldResult> modifiedStrides(strides); + modifiedStrides[modifiedStrides.size() - 2] = divideByConstant( + rewriter, createNdOp.getLoc(), + convertToValue(rewriter, createNdOp.getLoc(), + modifiedStrides[modifiedStrides.size() - 2]), + innerLaneData); + + // If the source is a static memref, we need to extract the pointer to + // base address. + if (memrefType && memrefType.hasStaticShape()) { + auto extractOp = memref::ExtractAlignedPointerAsIndexOp::create( + rewriter, createNdOp.getLoc(), source); + source = arith::IndexCastOp::create(rewriter, createNdOp.getLoc(), + rewriter.getI64Type(), + extractOp.getResult()) + .getResult(); + } + // Create a new CreateNdDescOp with the modified shape and converted type. + auto newCreateNdDescOp = xegpu::CreateNdDescOp::create( + rewriter, createNdOp.getLoc(), convertType, source, modifiedShape, + modifiedStrides); + rewriter.replaceOp(createNdOp, newCreateNdDescOp.getResult()); + return success(); + } +}; + +/// Checks if a LoadNdOp consumes a tensor desc type that was rewritten for +/// tranpose optimization. If so, rewrites the LoadNdOp to to align with the +/// adjusted tensor desc type. This can result in multiple LoadNdOps being +/// generated to fill in the original load shape. 
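// Editor's sketch (standalone, hypothetical 64x128 f16 source): the shape,
// stride and element-width adjustments the pattern above performs when the
// tensor descriptor is retyped so that lane_data becomes [1, 1].
#include <cassert>

int main() {
  const long innerLaneData = 2; // lane_data = [1, 2] for f16
  const long elemBits = 16;     // f16
  long shape[2] = {64, 128};    // original row-major shape
  long strides[2] = {128, 1};   // original strides

  const long newElemBits = elemBits * innerLaneData; // 32 -> i32 elements
  shape[1] /= innerLaneData;   // 128 f16 -> 64 i32 per row
  strides[0] /= innerLaneData; // row stride counted in i32 units
  assert(newElemBits == 32 && shape[1] == 64 && strides[0] == 64);
  // The innermost stride stays 1: the memref is still contiguous row-major.
  assert(strides[1] == 1);
  return 0;
}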
+class XeGPULoadNdDescOpPattern final + : public OpConversionPattern<xegpu::LoadNdOp> { +public: + using OpConversionPattern<xegpu::LoadNdOp>::OpConversionPattern; + LogicalResult + matchAndRewrite(xegpu::LoadNdOp loadNdOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto origTensorDescType = loadNdOp.getTensorDescType(); + auto adaptorType = + cast<xegpu::TensorDescType>(adaptor.getTensorDesc().getType()); + if (adaptorType == origTensorDescType) + return failure(); + // Offsets must be adjusted based on innerLaneData. + auto laneData = getMaybeLaneData(loadNdOp.getTensorDescType()).value(); + int64_t innerLaneData = laneData[1]; + auto offsets = loadNdOp.getMixedOffsets(); + if (offsets.empty()) + return rewriter.notifyMatchFailure(loadNdOp, + "Expecting offsets in LoadNd"); + SmallVector<OpFoldResult> modifiedOffsets(offsets); + modifiedOffsets.back() = divideByConstant( + rewriter, loadNdOp.getLoc(), + convertToValue(rewriter, loadNdOp.getLoc(), modifiedOffsets.back()), + innerLaneData); + // Get the 2D data shape of this loadNdOp in its original type including + // array length. + SmallVector<int64_t> origDataShape(origTensorDescType.getShape()); + // Adjust the data shape based on innerLaneData. + origDataShape.back() /= innerLaneData; + // HW supported shape is the new tensor desc shape after conversion. + SmallVector<int64_t> hwSupportedShape(adaptorType.getShape()); + VectorType origVectorType = + VectorType::get(origDataShape, adaptorType.getElementType()); + Value data; + // Orig data shape is 3D for the array length case. + if (origTensorDescType.getArrayLength() > 1) { + SmallVector<Value> arraySlices; + for (int64_t i = 0; i < origTensorDescType.getArrayLength(); ++i) { + Value slice = arith::ConstantOp::create( + rewriter, loadNdOp->getLoc(), origVectorType, + rewriter.getZeroAttr(origVectorType)); + // Increase the Y offset for each array slice. + Value offsetY = convertToValue(rewriter, loadNdOp->getLoc(), + modifiedOffsets.back()); + modifiedOffsets.back() = + arith::AddIOp::create( + rewriter, loadNdOp->getLoc(), offsetY, + arith::ConstantIndexOp::create(rewriter, loadNdOp->getLoc(), + i * origDataShape[1]) + .getResult()) + .getResult(); + slice = generateLoads( + rewriter, cast<TypedValue<VectorType>>(slice), modifiedOffsets, + cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()), + loadNdOp); + // BitCast back to original load shape without array length. + auto bitcastType = VectorType::get(origTensorDescType.getShape(), + origTensorDescType.getElementType()); + auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), + bitcastType, slice); + // BitCastOp must have the same layout as the original loadNdOp. + xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); + arraySlices.push_back(bitCastOp.getResult()); + } + rewriter.replaceOpWithMultiple(loadNdOp, {arraySlices}); + return success(); + } + data = arith::ConstantOp::create( + rewriter, loadNdOp->getLoc(), + VectorType::get(origDataShape, adaptorType.getElementType()), + rewriter.getZeroAttr(origVectorType)); + data = generateLoads( + rewriter, cast<TypedValue<VectorType>>(data), modifiedOffsets, + cast<TypedValue<xegpu::TensorDescType>>(adaptor.getTensorDesc()), + loadNdOp); + auto bitCastOp = vector::BitCastOp::create(rewriter, loadNdOp->getLoc(), + loadNdOp.getType(), data); + // BitCastOp must have the same layout as the original loadNdOp. 
+ xegpu::setDistributeLayoutAttr(bitCastOp->getOpResult(0), + origTensorDescType.getLayoutAttr()); + rewriter.replaceOp(loadNdOp, bitCastOp); + return success(); + } +}; + +/// Vector ExtractOp must be processed if the original tensor desc type has +/// array length greater than 1. In this case, the LoadNdOp is replaced with +/// multiple LoadNdOps for each array slice making the extraction unnecessary. +/// In this case, we simply remove the ExtractOp. +class VectorExtractOpPattern final + : public OpConversionPattern<vector::ExtractOp> { +public: + using OpConversionPattern<vector::ExtractOp>::OpConversionPattern; + LogicalResult + matchAndRewrite(vector::ExtractOp extractOp, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + // Check if the source of the extraction is split to multiple values. + if (adaptor.getSource().size() == 1) + return failure(); + auto mixedPos = extractOp.getMixedPosition(); + if (mixedPos.size() != 1) + return failure(); + auto mayBeInt = getConstantIntValue(mixedPos[0]); + if (!mayBeInt) + return failure(); + rewriter.replaceOp(extractOp, adaptor.getSource()[*mayBeInt]); + return success(); + } +}; + +} // namespace + +void xegpu::populateXeGPUOptimizeBlockLoadsPatterns( + RewritePatternSet &patterns) { + patterns.add<XeGPUCreateNdDescOpPattern, XeGPULoadNdDescOpPattern, + VectorExtractOpPattern>(patterns.getContext()); +} + +namespace { + +struct XeGPUOptimizeBlockLoadsPass final + : public xegpu::impl::XeGPUOptimizeBlockLoadsBase< + XeGPUOptimizeBlockLoadsPass> { + void runOnOperation() override { + MLIRContext &context = getContext(); + TypeConverter converter; + RewritePatternSet patterns(&context); + ConversionTarget target(context); + + // This pass is only meant for PVC and BMG targets. If unsupported target + // is found, exit early. + bool isTargetSupported = false; + getOperation()->walk([&](gpu::GPUFuncOp funcOp) { + auto chipStr = xegpu::getChipStr(funcOp); + if (chipStr && (chipStr.value() == "pvc" || chipStr.value() == "bmg")) + isTargetSupported = true; + }); + + if (!isTargetSupported) { + DBGS() << "XeGPUOptimizeBlockLoadsPass only supports PVC and BMG targets." + << "\n"; + return; + } + + // CreateNdDescOp and LoadNdOp with optimizable tensor desc types must be + // converted. + target.addDynamicallyLegalOp<xegpu::CreateNdDescOp>( + [&](xegpu::CreateNdDescOp createNdOp) { + return !canBeOptimizedForTranspose(createNdOp.getType()); + }); + target.addDynamicallyLegalOp<xegpu::LoadNdOp>( + [&](xegpu::LoadNdOp loadNdOp) { + return !canBeOptimizedForTranspose(loadNdOp.getTensorDescType()); + }); + // Vector ExtractOps can have optimizable layouts if they extract from + // LoadNdOps with array length greater than 1. These ExtractOps must be + // converted. 
+ target.addDynamicallyLegalOp<vector::ExtractOp>( + [&](vector::ExtractOp extractOp) { + auto layout = xegpu::getDistributeLayoutAttr(extractOp.getResult()); + if (!layout) + return true; + auto laneLayout = layout.getEffectiveLaneLayoutAsInt(); + auto laneData = layout.getEffectiveLaneDataAsInt(); + return !canBeOptimizedForTranspose(laneLayout, laneData); + }); + converter.addConversion([](Type type) { return type; }); + + target.addLegalDialect<arith::ArithDialect, memref::MemRefDialect, + vector::VectorDialect>(); + scf::populateSCFStructuralTypeConversionsAndLegality(converter, patterns, + target); + xegpu::populateXeGPUOptimizeBlockLoadsPatterns(patterns); + if (failed(applyPartialConversion(getOperation(), target, + std::move(patterns)))) { + DBGS() << "Optimize block loads pass failed.\n"; + return signalPassFailure(); + } + } +}; + +} // namespace diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp index 90eae871a5ef3..4e1a539771d2f 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp @@ -204,28 +204,6 @@ struct LayoutInfoLattice : public Lattice<LayoutInfo> { using Lattice::Lattice; }; -/// Helper Function to find a proper instruction multiple for the user-supplied -/// sg-level data shape. `candidates` are uArch allowed shapes. -/// `candidateMultiples` are uArch multiples of such shapes (e.g., block count). -template <typename T> -int getLargestDivisor(T dim, ArrayRef<T> candidates, - ArrayRef<T> candidateMultiples = {}) { - static_assert(std::is_integral<T>::value, "T must be an integer type"); - int largest = -1; - SmallVector<T> multiples = {1}; - if (!candidateMultiples.empty()) - multiples = - SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end()); - for (T candidate : candidates) { - for (T multiple : multiples) { - int value = static_cast<int>(candidate * multiple); - if (value != 0 && dim % value == 0 && value > largest) - largest = value; - } - } - return largest; -} - /// Helper Functions to get default layouts. A `default layout` is a layout that /// is assigned to a value when the layout is not fixed by some anchor operation /// (like DPAS). 
@@ -505,7 +483,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp( prefetch.emitWarning("No known block params found for the element type."); auto [bWidth, bHeight, bCount] = blockWHC.value(); SmallVector<int> instData; - int instWidth = getLargestDivisor( + int instWidth = xegpu::getLargestDivisor( static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 1)), bWidth, bCount); if (instWidth == -1) @@ -514,7 +492,7 @@ void LayoutInfoPropagation::visitPrefetchNdOp( if (tdescTy.getRank() == 1) instData = {instWidth}; else { - int instHeight = getLargestDivisor( + int instHeight = xegpu::getLargestDivisor( static_cast<int>(tdescTy.getDimSize(tdescTy.getRank() - 2)), bHeight); if (instHeight == -1) prefetch.emitWarning( @@ -634,7 +612,7 @@ void LayoutInfoPropagation::visitDpasOp( const unsigned dataALen = aTy.getShape().front(); auto supportedALen = uArchInstruction->getSupportedM(aTy.getElementType()); const int maxALen = - getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen)); + xegpu::getLargestDivisor(dataALen, ArrayRef<unsigned>(supportedALen)); if (maxALen == -1) dpas.emitWarning( "No suitable instruction multiple found for the given shape."); @@ -642,7 +620,7 @@ void LayoutInfoPropagation::visitDpasOp( const unsigned dataBLen = bTy.getShape().back(); auto supportedBLen = uArchInstruction->getSupportedK(bTy.getElementType()); const int maxBLen = - getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen)); + xegpu::getLargestDivisor(dataBLen, ArrayRef<unsigned>(supportedBLen)); if (maxBLen == -1) dpas.emitWarning( "No suitable instruction multiple found for the given shape."); @@ -662,7 +640,7 @@ void LayoutInfoPropagation::visitDpasOp( const unsigned dataCLen = bTy.getShape().back(); auto supportedCLen = uArchInstruction->getSupportedN(bTy.getElementType()); const int maxCLen = - getLargestDivisor(dataCLen, ArrayRef<unsigned>(supportedCLen)); + xegpu::getLargestDivisor(dataCLen, ArrayRef<unsigned>(supportedCLen)); if (maxCLen == -1) dpas.emitWarning( "No suitable instruction multiple found for the given shape."); @@ -691,7 +669,7 @@ void LayoutInfoPropagation::visitStoreNdOp( store.emitWarning("No known block params found for the element type."); auto [bWidth, bHeight, bCount] = blockWHC.value(); SmallVector<int> instData; - int instWidth = getLargestDivisor( + int instWidth = xegpu::getLargestDivisor( static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 1)), bWidth, bCount); if (instWidth == -1) @@ -700,7 +678,7 @@ void LayoutInfoPropagation::visitStoreNdOp( if (dataTy.getRank() == 1) instData = {instWidth}; else { - int instHeight = getLargestDivisor( + int instHeight = xegpu::getLargestDivisor( static_cast<int>(dataTy.getDimSize(dataTy.getRank() - 2)), bHeight); if (instHeight == -1) store.emitWarning( @@ -904,9 +882,16 @@ void LayoutInfoPropagation::visitStoreScatterOp( if (dstTdescTy.getChunkSizeAsInt() > 1) instData.push_back(chunkSize); } - LayoutInfo payloadLayout = getDefaultSIMTLayoutInfo( - payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(), - /*scattered=*/true); + + LayoutInfo payloadLayout; + + if (auto layout = storeScatter.getLayoutAttr()) { + payloadLayout = LayoutInfo(layout); + } else { + payloadLayout = getDefaultSIMTLayoutInfo( + payloadTy, uArch, instData, uArch->getGeneralPackedFormatBitSize(), + /*scattered=*/true); + } LayoutInfo maskLayout = getDefaultSIMTLayoutInfo(storeScatter->getContext(), 1, subgroupSize); @@ -1041,7 +1026,7 @@ static LogicalResult updateOp(mlir::OpBuilder &builder, mlir::Operation *op, } // If the result is a 
vector type, add a temporary layout attribute to the
   // op.
-      xegpu::setDistributeLayoutAttr(result, layout);
+      xegpu::setDistributeLayoutAttr(result, layout, /*respectPermLayout=*/true);
   }
   return success();
 }
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 5a3b27ec6108e..bbd7733e89c29 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
+#include "mlir/Dialect/Index/IR/IndexDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
@@ -912,6 +913,186 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
   }
 };
 
+static SmallVector<Value> computeDistributedCoordinatesForMatrixOp(
+    PatternRewriter &rewriter, Location loc, xegpu::DistributeLayoutAttr layout,
+    Value laneId, ArrayRef<int64_t> payloadShape, ValueRange origOffsets) {
+  SmallVector<Value> newCoords;
+  auto maybeCoords =
+      layout.computeDistributedCoords(rewriter, loc, laneId, payloadShape);
+  if (failed(maybeCoords))
+    return {};
+  assert(maybeCoords.value().size() == 1 &&
+         "Expected one set of distributed offsets");
+  SmallVector<OpFoldResult> ofrVec = xegpu::addWithRightAligned(
+      rewriter, loc, getAsOpFoldResult(maybeCoords.value()[0]),
+      getAsOpFoldResult(origOffsets));
+  newCoords = llvm::to_vector(llvm::map_range(
+      ofrVec, [&](OpFoldResult ofr) -> Value { return cast<Value>(ofr); }));
+  return newCoords;
+}
+
+/// Pattern for distributing xegpu::LoadMatrixOp.
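+/// The load is sunk out of the enclosing WarpExecuteOnLane0Op: its memory
+/// descriptor and offsets are yielded through the warp op, the payload type
+/// is narrowed to the per-lane shape implied by the layout, and (unless the
+/// subgroup_block_io attribute is present) the offsets are rebased to
+/// lane-local coordinates via computeDistributedCoordinatesForMatrixOp.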
+struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + gpu::YieldOp yield = warpOp.getTerminator(); + Operation *lastNode = yield->getPrevNode(); + auto matrixOp = dyn_cast_or_null<xegpu::LoadMatrixOp>(lastNode); + if (!matrixOp) + return failure(); + + OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) { + return isa<xegpu::LoadMatrixOp>(op) && matrixOp == op; + }); + if (!producedByLastLoad) + return rewriter.notifyMatchFailure( + warpOp, "The last op is not xegpu::LoadMatrixOp"); + const int operandIdx = producedByLastLoad->getOperandNumber(); + + VectorType sgPayloadTy = + dyn_cast<VectorType>(matrixOp.getResult().getType()); + VectorType warpResultTy = + cast<VectorType>(warpOp.getResult(operandIdx).getType()); + if (!sgPayloadTy) + return rewriter.notifyMatchFailure( + matrixOp, "the matrix op payload must be a vector type"); + + auto loc = matrixOp.getLoc(); + auto offsets = matrixOp.getMixedOffsets(); + if (offsets.empty()) + return rewriter.notifyMatchFailure(matrixOp, + "the load op must have offsets"); + SmallVector<Value> offsetsAsValues = + vector::getAsValues(rewriter, matrixOp.getLoc(), offsets); + + auto layout = matrixOp.getLayoutAttr(); + if (!layout) + return rewriter.notifyMatchFailure( + matrixOp, "the matrix operation lacks layout attribute"); + + FailureOr<VectorType> distPayloadByWarpOpOrFailure = + getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy); + if (failed(distPayloadByWarpOpOrFailure)) + return rewriter.notifyMatchFailure( + matrixOp, "Failed to distribute matrix op payload based on layout."); + + SmallVector<Value> operands = {matrixOp.getMemDesc()}; + const unsigned offsetsStartIdx = operands.size(); + operands.append(offsetsAsValues); + + SmallVector<Type> operandTypes = llvm::to_vector( + llvm::map_range(operands, [](Value v) { return v.getType(); })); + + SmallVector<size_t> newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, operands, operandTypes, newRetIndices); + SmallVector<Value> newOperands = llvm::map_to_vector( + newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); }); + + SmallVector<int64_t> newConstOffsets{matrixOp.getConstOffsets()}; + std::fill(newConstOffsets.begin(), newConstOffsets.end(), + ShapedType::kDynamic); + DenseI64ArrayAttr newConstOffsetsAttr = + rewriter.getDenseI64ArrayAttr(newConstOffsets); + ValueRange currentOffsets = + ValueRange(newOperands).drop_front(offsetsStartIdx); + + SmallVector<Value> newCoords = currentOffsets; + rewriter.setInsertionPointAfter(newWarpOp); + + if (!matrixOp.getSubgroupBlockIoAttr()) { + newCoords = computeDistributedCoordinatesForMatrixOp( + rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(), + currentOffsets); + } + xegpu::LoadMatrixOp newOp = xegpu::LoadMatrixOp::create( + rewriter, newWarpOp.getLoc(), *distPayloadByWarpOpOrFailure, + newOperands[0], ValueRange(newCoords), newConstOffsetsAttr, + matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{}); + // Resolve the output type and replace all uses. + rewriter.replaceAllUsesWith( + newWarpOp.getResult(operandIdx), + resolveDistributedTy(newOp.getResult(), warpResultTy, rewriter)); + return success(); + } +}; + +/// Pattern for distributing xegpu::StoreMatrixOp. 
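+/// Mirrors LoadMatrixDistribution: the stored payload, memory descriptor and
+/// offsets are yielded out of the warp op, the payload is rewritten to its
+/// per-lane distributed type, offsets are adjusted per lane unless the
+/// subgroup_block_io attribute is present, and the original store inside the
+/// warp region is erased.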
+struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + gpu::YieldOp yield = warpOp.getTerminator(); + Operation *lastNode = yield->getPrevNode(); + auto matrixOp = dyn_cast_or_null<xegpu::StoreMatrixOp>(lastNode); + if (!matrixOp) + return failure(); + + VectorType sgPayloadTy = dyn_cast<VectorType>(matrixOp.getData().getType()); + if (!sgPayloadTy) + return rewriter.notifyMatchFailure( + matrixOp, "the matrix op payload must be a vector type"); + + auto loc = matrixOp.getLoc(); + auto offsets = matrixOp.getMixedOffsets(); + if (offsets.empty()) + return rewriter.notifyMatchFailure(matrixOp, + "the store op must have offsets"); + SmallVector<Value> offsetsAsValues = + vector::getAsValues(rewriter, matrixOp.getLoc(), offsets); + + auto layout = matrixOp.getLayoutAttr(); + if (!layout) + return rewriter.notifyMatchFailure( + matrixOp, "the matrix operation lacks layout attribute"); + + FailureOr<VectorType> distPayloadByWarpOpOrFailure = + getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy); + if (failed(distPayloadByWarpOpOrFailure)) + return rewriter.notifyMatchFailure( + matrixOp, "Failed to distribute matrix op payload based on layout."); + + SmallVector<Value> operands = {matrixOp.getData(), matrixOp.getMemDesc()}; + const unsigned offsetsStartIdx = operands.size(); + operands.append(offsetsAsValues); + + SmallVector<Type> operandTypes = llvm::to_vector( + llvm::map_range(operands, [](Value v) { return v.getType(); })); + operandTypes[0] = *distPayloadByWarpOpOrFailure; + + SmallVector<size_t> newRetIndices; + gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, operands, operandTypes, newRetIndices); + SmallVector<Value> newOperands = llvm::map_to_vector( + newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); }); + + SmallVector<int64_t> newConstOffsets{matrixOp.getConstOffsets()}; + std::fill(newConstOffsets.begin(), newConstOffsets.end(), + ShapedType::kDynamic); + DenseI64ArrayAttr newConstOffsetsAttr = + rewriter.getDenseI64ArrayAttr(newConstOffsets); + ValueRange currentOffsets = + ValueRange(newOperands).drop_front(offsetsStartIdx); + + SmallVector<Value> newCoords = currentOffsets; + rewriter.setInsertionPointAfter(newWarpOp); + + if (!matrixOp.getSubgroupBlockIoAttr()) { + newCoords = computeDistributedCoordinatesForMatrixOp( + rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(), + currentOffsets); + } + + xegpu::StoreMatrixOp::create( + rewriter, loc, TypeRange{}, newOperands[0], newOperands[1], + ValueRange(newCoords), newConstOffsetsAttr, + matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{}); + rewriter.eraseOp(matrixOp); + return success(); + } +}; + /// Distribute a scattered load op. The logic and requirements are the same as /// for the scattered store distribution. The warpOp's payload vector is /// expected to be distributed by the load's result consumer. 
@@ -1443,7 +1624,8 @@ void xegpu::populateXeGPUSubgroupDistributePatterns( LoadNdDistribution, DpasDistribution, PrefetchNdDistribution, GpuBarrierDistribution, VectorMultiReductionDistribution, LoadDistribution, StoreDistribution, VectorTransposeDistribution, - VectorBitcastDistribution, + VectorBitcastDistribution, LoadMatrixDistribution, + StoreMatrixDistribution, MemrefExtractAlignedPointerAsIndexDistribution>( patterns.getContext(), /*pattern benefit=*/regularPatternBenefit); @@ -1468,6 +1650,8 @@ void XeGPUSubgroupDistributePass::runOnOperation() { // Layouts are needed for vector type only. if (!isa<VectorType>(operand.get().getType())) continue; + if (isa<xegpu::LoadMatrixOp, xegpu::StoreMatrixOp>(op)) + continue; auto layout = xegpu::getDistributeLayoutAttr(operand.get()); if (!layout) { diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index e6e71cc29a80a..c3bf9606693a8 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -678,12 +678,16 @@ struct UnrollLoadGatherOpWithOffset pack(offsets, convertedOffsetTypes, *targetShape, loc, rewriter); } + auto layout = dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutAttr()); + if (layout) + layout = layout.dropInstData(); + SmallVector<Value> newOps; for (auto [o, m] : llvm::zip(convertedOffsets, convertedMasks)) { auto newOp = xegpu::LoadGatherOp::create( rewriter, loc, newValueTy, op.getSource(), o, m, rewriter.getI64IntegerAttr(chunkSize), op.getL1HintAttr(), - op.getL2HintAttr(), op.getL3HintAttr()); + op.getL2HintAttr(), op.getL3HintAttr(), layout); newOps.push_back(newOp); } @@ -774,12 +778,16 @@ struct UnrollStoreScatterOpWithOffsets SmallVector<Value> convertedValues = pack(op.getValue(), convertedValTypes, *targetShape, loc, rewriter); + auto layout = dyn_cast_if_present<xegpu::LayoutAttr>(op.getLayoutAttr()); + if (layout) + layout = layout.dropInstData(); + for (auto [v, o, m] : llvm::zip(convertedValues, convertedOffsets, convertedMasks)) { xegpu::StoreScatterOp::create(rewriter, loc, v, op.getDest(), o, m, rewriter.getI64IntegerAttr(chunkSize), op.getL1HintAttr(), op.getL2HintAttr(), - op.getL3HintAttr()); + op.getL3HintAttr(), layout); } rewriter.eraseOp(op); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 9fc5ad9af5c7b..0a9ef0aa6df96 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -114,7 +114,8 @@ genOffsetsList(ConversionPatternRewriter &rewriter, OpType op, // Compute the list of subgroup-relative offsets for sub-tensors or sub-memory // descriptors to be accessed, based on the layout information. 
ArrayRef<int64_t> wgShape = op.getDataShape(); - auto maybeDescOffsets = layout.getOffsets(rewriter, loc, sgId, wgShape); + auto maybeDescOffsets = + layout.computeDistributedCoords(rewriter, loc, sgId, wgShape); if (failed(maybeDescOffsets)) return failure(); @@ -830,8 +831,8 @@ struct WgToSgArithConstantOp : public OpConversionPattern<arith::ConstantOp> { // Get subgroup id Value sgId = gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); - - auto sgOffsets = layout.getOffsets(rewriter, loc, sgId, wgShape); + auto sgOffsets = + layout.computeDistributedCoords(rewriter, loc, sgId, wgShape); if (failed(sgOffsets)) return failure(); @@ -888,8 +889,8 @@ struct WgToSgLoadGatherOpWithOffset return failure(); ArrayRef<int64_t> wgShape = resultType.getShape(); - xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getResult()); + xegpu::LayoutAttr layout = dyn_cast_if_present<xegpu::LayoutAttr>( + xegpu::getDistributeLayoutAttr(op.getResult())); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -914,9 +915,8 @@ struct WgToSgLoadGatherOpWithOffset llvm::zip(adaptor.getOffsets(), adaptor.getMask())) { auto newLoadOp = xegpu::LoadGatherOp::create( rewriter, loc, newTy, op.getSource(), offsets, mask, chunkSizeAttr, - op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr()); - xegpu::setDistributeLayoutAttr(newLoadOp->getResult(0), - layout.dropSgLayoutAndData()); + op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(), + layout.dropSgLayoutAndData()); newLoadOps.push_back(newLoadOp); } rewriter.replaceOpWithMultiple(op, {newLoadOps}); @@ -941,8 +941,8 @@ struct WgToSgStoreScatterOpWithOffset if (!valueType) return failure(); - xegpu::DistributeLayoutAttr layout = - xegpu::getDistributeLayoutAttr(op.getOperand(0)); + xegpu::LayoutAttr layout = dyn_cast_if_present<xegpu::LayoutAttr>( + xegpu::getDistributeLayoutAttr(op.getOperand(0))); if (!layout || !layout.isForWorkgroup()) return failure(); @@ -964,7 +964,8 @@ struct WgToSgStoreScatterOpWithOffset adaptor.getValue(), adaptor.getOffsets(), adaptor.getMask())) { auto store = xegpu::StoreScatterOp::create( rewriter, loc, val, op.getDest(), offs, mask, chunkSizeAttr, - op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr()); + op.getL1HintAttr(), op.getL2HintAttr(), op.getL3HintAttr(), + layout.dropSgLayoutAndData()); // Update the layout attribute to drop sg_layout and sg_data. if (!layout.getEffectiveLaneLayoutAsInt().empty() || !layout.getEffectiveInstDataAsInt().empty()) { @@ -1052,7 +1053,8 @@ struct WgToSgVectorStepOp : public OpConversionPattern<vector::StepOp> { Value sgId = gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); - auto sgOffsets = layout.getOffsets(rewriter, loc, sgId, wgShape); + auto sgOffsets = + layout.computeDistributedCoords(rewriter, loc, sgId, wgShape); if (failed(sgOffsets)) return failure(); @@ -1217,6 +1219,70 @@ struct WgToSgMultiDimReductionOp } }; +// This pattern transforms vector.transpose ops to work at subgroup level. 
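+// It requires both the source and the result to carry workgroup layouts whose
+// sg_layout, sg_data and order are related by the transpose permutation; each
+// subgroup-local slice is then transposed individually and tagged with the
+// result layout minus its sg_layout/sg_data fields.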
+struct WgToSgVectorTransposeOp + : public OpConversionPattern<vector::TransposeOp> { + using OpConversionPattern<vector::TransposeOp>::OpConversionPattern; + + LogicalResult + matchAndRewrite(vector::TransposeOp op, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + VectorType resultType = op.getResultVectorType(); + + ArrayRef<int64_t> wgShape = resultType.getShape(); + xegpu::DistributeLayoutAttr layout = + xegpu::getDistributeLayoutAttr(op.getResult()); + if (!layout || !layout.isForWorkgroup()) + return failure(); + + xegpu::DistributeLayoutAttr sourceLayout = + xegpu::getDistributeLayoutAttr(op.getVector()); + if (!sourceLayout || !sourceLayout.isForWorkgroup()) + return failure(); + + SmallVector<int64_t> sourceSgLayout = + sourceLayout.getEffectiveSgLayoutAsInt(); + SmallVector<int64_t> resultSgLayout = layout.getEffectiveSgLayoutAsInt(); + DenseI32ArrayAttr sourceOrder = sourceLayout.getOrder(); + DenseI32ArrayAttr resultOrder = layout.getOrder(); + + if (!sourceOrder || !resultOrder) { + return rewriter.notifyMatchFailure( + op, "Both source and result must have order attributes"); + } + + ArrayRef<int64_t> permutation = op.getPermutation(); + size_t permutationSize = permutation.size(); + if (sourceSgLayout.size() != permutationSize || + resultSgLayout.size() != permutationSize) { + return rewriter.notifyMatchFailure( + op, "Layouts and permutation must have the same rank"); + } + + // Check that sgLayout, sgData & order are properly transposed for source + // and result + if (!layout.isTransposeOf(sourceLayout, permutation)) + return rewriter.notifyMatchFailure( + op, "Result layout is not a valid transpose of source layout " + "according to permutation"); + + SmallVector<int64_t> sgShape = getSgShapeAndCount(wgShape, layout).first; + VectorType newResultType = + VectorType::get(sgShape, resultType.getElementType()); + SmallVector<Value> newTransposeOps; + for (auto src : adaptor.getVector()) { + auto newTranspose = vector::TransposeOp::create( + rewriter, op.getLoc(), newResultType, src, permutation); + xegpu::setDistributeLayoutAttr(newTranspose->getResult(0), + layout.dropSgLayoutAndData()); + newTransposeOps.push_back(newTranspose.getResult()); + } + + rewriter.replaceOpWithMultiple(op, {newTransposeOps}); + return success(); + } +}; + } // namespace namespace mlir { @@ -1231,7 +1297,8 @@ void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) { WgToSgArithConstantOp, WgToSgLoadGatherOpWithOffset, WgToSgStoreScatterOpWithOffset, WgToSgLoadMatrixOp, WgToSgStoreMatrixOp, WgToSgVectorStepOp, WgToSgVectorShapeCastOp, - WgToSgMultiDimReductionOp>(patterns.getContext()); + WgToSgMultiDimReductionOp, WgToSgVectorTransposeOp>( + patterns.getContext()); } } // namespace xegpu } // namespace mlir @@ -1358,7 +1425,9 @@ void XeGPUWgToSgDistributePass::runOnOperation() { return isLegal(layout); }); - target.addDynamicallyLegalOp<vector::ShapeCastOp, vector::StepOp>( + target.addDynamicallyLegalOp<vector::ShapeCastOp, vector::StepOp, + vector::TransposeOp, vector::BroadcastOp, + vector::MultiDimReductionOp>( [=](Operation *op) -> bool { // Check for either a SliceAttr or LayoutAttr on the result. 
auto layout = xegpu::getDistributeLayoutAttr(op->getResult(0));
@@ -1377,16 +1446,6 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
         return isLegal(layout);
       });
 
-  target.addDynamicallyLegalOp<vector::BroadcastOp>(
-      [=](vector::BroadcastOp op) -> bool {
-        return isLegal(xegpu::getDistributeLayoutAttr(op.getResult()));
-      });
-
-  target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
-      [=](vector::MultiDimReductionOp op) -> bool {
-        return isLegal(xegpu::getDistributeLayoutAttr(op.getResult()));
-      });
-
   target.addDynamicallyLegalOp<xegpu::ConvertLayoutOp>(
       [=](xegpu::ConvertLayoutOp op) -> bool {
         return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout());
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index a38993e0c55b1..de9e09d427665 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -144,6 +144,11 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) {
     std::string layoutName = getLayoutName(result);
     if (defOp->hasAttr(layoutName))
       return defOp->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+
+    // Check for "permanent" layout only after "temporary" layout name lookup,
+    // for backward compatibility.
+    if (auto loadGatherOp = dyn_cast<xegpu::LoadGatherOp>(defOp))
+      return loadGatherOp.getLayoutAttr();
   }
 
   if (auto arg = dyn_cast<BlockArgument>(value)) {
@@ -171,27 +176,77 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) {
   std::string layoutName = xegpu::getLayoutName(opr);
   if (op->hasAttr(layoutName))
     return op->getAttrOfType<xegpu::DistributeLayoutAttr>(layoutName);
+
+  // Check for "permanent" layout only after "temporary" layout name lookup.
+  if (auto storeScatterOp = dyn_cast<xegpu::StoreScatterOp>(op))
+    if (auto layout = storeScatterOp.getLayoutAttr())
+      return layout;
+
   return getDistributeLayoutAttr(opr.get());
 }
 
+// Returns the permanent layout attribute for the given result if it's
+// available on the defining op. Otherwise returns the provided layout.
+xegpu::DistributeLayoutAttr
+maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout,
+                         const OpResult &result, mlir::Operation *owner,
+                         const std::string &name) {
+  xegpu::DistributeLayoutAttr candidate = layout;
+
+  if (auto loadOp = dyn_cast<xegpu::LoadGatherOp>(owner)) {
+    if (auto perm = loadOp.getLayoutAttr())
+      candidate = perm;
+  }
+
+  return candidate;
+}
+
+// Returns the permanent layout attribute for the given operand if it's
+// available on the owning op. Otherwise returns the provided layout.
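+// For xegpu::StoreScatterOp only the stored value (operand 0) can carry such
+// a permanent layout; all other operands keep the provided layout.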
+xegpu::DistributeLayoutAttr +maybePickPermanentLayout(xegpu::DistributeLayoutAttr layout, + const OpOperand &operand, mlir::Operation *owner, + const std::string &name) { + xegpu::DistributeLayoutAttr candidate = layout; + unsigned idx = const_cast<OpOperand &>(operand).getOperandNumber(); + + if (auto storeOp = dyn_cast<xegpu::StoreScatterOp>(owner)) { + if (idx == 0) { + if (auto perm = storeOp.getLayoutAttr()) + candidate = perm; + } + } + + return candidate; +} + template <typename T, typename> void xegpu::setDistributeLayoutAttr(const T &operandOrResult, - const DistributeLayoutAttr layout) { + const DistributeLayoutAttr layout, + bool respectPermLayout) { Operation *owner = operandOrResult.getOwner(); std::string name = xegpu::getLayoutName(operandOrResult); - if (layout && !owner->hasAttrOfType<DistributeLayoutAttr>(name)) - owner->setAttr(name, layout); + + if (owner->hasAttrOfType<DistributeLayoutAttr>(name)) + return; + + DistributeLayoutAttr candidate = layout; + if (respectPermLayout) + candidate = maybePickPermanentLayout(layout, operandOrResult, owner, name); + + if (candidate) + owner->setAttr(name, candidate); } // Explicit instantiation for OpResult template void xegpu::setDistributeLayoutAttr<mlir::OpResult>( const mlir::OpResult &result, - const mlir::xegpu::DistributeLayoutAttr layout); + const mlir::xegpu::DistributeLayoutAttr layout, bool respectPermLayout); // Explicit instantiation for OpOperand template void xegpu::setDistributeLayoutAttr<mlir::OpOperand>( const mlir::OpOperand &operand, - const mlir::xegpu::DistributeLayoutAttr layout); + const mlir::xegpu::DistributeLayoutAttr layout, bool respectPermLayout); void xegpu::setDistributeLayoutAttrs( Operation *op, function_ref<DistributeLayoutAttr(Value)> getLayoutImpl) { @@ -500,3 +555,29 @@ xegpu::addWithRightAligned(OpBuilder &builder, Location loc, results.append(addElementwise(builder, loc, a, b)); return results; } + +template <typename T> +int xegpu::getLargestDivisor(T dim, ArrayRef<T> candidates, + ArrayRef<T> candidateMultiples) { + static_assert(std::is_integral<T>::value, "T must be an integer type"); + int largest = -1; + SmallVector<T> multiples = {1}; + if (!candidateMultiples.empty()) + multiples = + SmallVector<T>(candidateMultiples.begin(), candidateMultiples.end()); + for (T candidate : candidates) { + for (T multiple : multiples) { + int value = static_cast<int>(candidate * multiple); + if (value != 0 && dim % value == 0 && value > largest) + largest = value; + } + } + return largest; +} + +/// Explicit instantiations +template int xegpu::getLargestDivisor<int>(int dim, ArrayRef<int> candidates, + ArrayRef<int> candidateMultiples); +template int +xegpu::getLargestDivisor<unsigned>(unsigned dim, ArrayRef<unsigned> candidates, + ArrayRef<unsigned> candidateMultiples); diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp index 3d86b09b32538..0964e1b8c5ef3 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.cpp @@ -36,9 +36,6 @@ using mlir::LLVM::detail::createIntrinsicCall; static llvm::Intrinsic::ID getReduxIntrinsicId(llvm::Type *resultType, NVVM::ReduxKind kind, bool hasAbs, bool hasNaN) { - if (!(resultType->isIntegerTy(32) || resultType->isFloatTy())) - llvm_unreachable("unsupported data type for redux"); - switch (kind) { case NVVM::ReduxKind::ADD: return llvm::Intrinsic::nvvm_redux_sync_add; diff --git 
a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index 3a23bbfd70eac..f8c38fadbd229 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -92,6 +92,22 @@ static OpBuilder::InsertPoint computeInsertPoint(ArrayRef<Value> vals) { return pt; } +namespace { +enum OpConversionMode { + /// In this mode, the conversion will ignore failed conversions to allow + /// illegal operations to co-exist in the IR. + Partial, + + /// In this mode, all operations must be legal for the given target for the + /// conversion to succeed. + Full, + + /// In this mode, operations are analyzed for legality. No actual rewrites are + /// applied to the operations on success. + Analysis, +}; +} // namespace + //===----------------------------------------------------------------------===// // ConversionValueMapping //===----------------------------------------------------------------------===// @@ -866,8 +882,9 @@ namespace mlir { namespace detail { struct ConversionPatternRewriterImpl : public RewriterBase::Listener { explicit ConversionPatternRewriterImpl(ConversionPatternRewriter &rewriter, - const ConversionConfig &config) - : rewriter(rewriter), config(config), + const ConversionConfig &config, + OperationConverter &opConverter) + : rewriter(rewriter), config(config), opConverter(opConverter), notifyingRewriter(rewriter.getContext(), config.listener) {} //===--------------------------------------------------------------------===// @@ -1105,10 +1122,6 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// A set of operations that were modified by the current pattern. SetVector<Operation *> patternModifiedOps; - /// A set of blocks that were inserted (newly-created blocks or moved blocks) - /// by the current pattern. - SetVector<Block *> patternInsertedBlocks; - /// A list of unresolved materializations that were created by the current /// pattern. DenseSet<UnrealizedConversionCastOp> patternMaterializations; @@ -1128,6 +1141,9 @@ struct ConversionPatternRewriterImpl : public RewriterBase::Listener { /// Dialect conversion configuration. const ConversionConfig &config; + /// The operation converter to use for recursive legalization. + OperationConverter &opConverter; + /// A set of erased operations. This set is utilized only if /// `allowPatternRollback` is set to "false". Conceptually, this set is /// similar to `replacedOps` (which is maintained when the flag is set to @@ -2046,8 +2062,6 @@ void ConversionPatternRewriterImpl::notifyBlockInserted( if (!config.allowPatternRollback && config.listener) config.listener->notifyBlockInserted(block, previous, previousIt); - patternInsertedBlocks.insert(block); - if (wasDetached) { // If the block was detached, it is most likely a newly created block. 
if (config.allowPatternRollback) { @@ -2090,9 +2104,10 @@ void ConversionPatternRewriterImpl::notifyMatchFailure( //===----------------------------------------------------------------------===// ConversionPatternRewriter::ConversionPatternRewriter( - MLIRContext *ctx, const ConversionConfig &config) - : PatternRewriter(ctx), - impl(new detail::ConversionPatternRewriterImpl(*this, config)) { + MLIRContext *ctx, const ConversionConfig &config, + OperationConverter &opConverter) + : PatternRewriter(ctx), impl(new detail::ConversionPatternRewriterImpl( + *this, config, opConverter)) { setListener(impl.get()); } @@ -2213,6 +2228,37 @@ ConversionPatternRewriter::getRemappedValues(ValueRange keys, return success(); } +LogicalResult ConversionPatternRewriter::legalize(Region *r) { + // Fast path: If the region is empty, there is nothing to legalize. + if (r->empty()) + return success(); + + // Gather a list of all operations to legalize. This is done before + // converting the entry block signature because unrealized_conversion_cast + // ops should not be included. + SmallVector<Operation *> ops; + for (Block &b : *r) + for (Operation &op : b) + ops.push_back(&op); + + // If the current pattern runs with a type converter, convert the entry block + // signature. + if (const TypeConverter *converter = impl->currentTypeConverter) { + std::optional<TypeConverter::SignatureConversion> conversion = + converter->convertBlockSignature(&r->front()); + if (!conversion) + return failure(); + applySignatureConversion(&r->front(), *conversion, converter); + } + + // Legalize all operations in the region. + for (Operation *op : ops) + if (failed(legalize(op))) + return failure(); + + return success(); +} + void ConversionPatternRewriter::inlineBlockBefore(Block *source, Block *dest, Block::iterator before, ValueRange argValues) { @@ -2399,17 +2445,12 @@ class OperationLegalizer { bool canApplyPattern(Operation *op, const Pattern &pattern); /// Legalize the resultant IR after successfully applying the given pattern. - LogicalResult legalizePatternResult(Operation *op, const Pattern &pattern, - const RewriterState &curState, - const SetVector<Operation *> &newOps, - const SetVector<Operation *> &modifiedOps, - const SetVector<Block *> &insertedBlocks); - - /// Legalizes the actions registered during the execution of a pattern. LogicalResult - legalizePatternBlockRewrites(Operation *op, - const SetVector<Block *> &insertedBlocks, - const SetVector<Operation *> &newOps); + legalizePatternResult(Operation *op, const Pattern &pattern, + const RewriterState &curState, + const SetVector<Operation *> &newOps, + const SetVector<Operation *> &modifiedOps); + LogicalResult legalizePatternCreatedOperations(const SetVector<Operation *> &newOps); LogicalResult @@ -2608,7 +2649,6 @@ LogicalResult OperationLegalizer::legalizeWithFold(Operation *op) { auto cleanup = llvm::make_scope_exit([&]() { rewriterImpl.patternNewOps.clear(); rewriterImpl.patternModifiedOps.clear(); - rewriterImpl.patternInsertedBlocks.clear(); }); // Upon failure, undo all changes made by the folder. 
@@ -2662,24 +2702,16 @@ LogicalResult OperationLegalizer::legalizeWithFold(Operation *op) { static void reportNewIrLegalizationFatalError(const Pattern &pattern, const SetVector<Operation *> &newOps, - const SetVector<Operation *> &modifiedOps, - const SetVector<Block *> &insertedBlocks) { + const SetVector<Operation *> &modifiedOps) { auto newOpNames = llvm::map_range( newOps, [](Operation *op) { return op->getName().getStringRef(); }); auto modifiedOpNames = llvm::map_range( modifiedOps, [](Operation *op) { return op->getName().getStringRef(); }); - StringRef detachedBlockStr = "(detached block)"; - auto insertedBlockNames = llvm::map_range(insertedBlocks, [&](Block *block) { - if (block->getParentOp()) - return block->getParentOp()->getName().getStringRef(); - return detachedBlockStr; - }); - llvm::report_fatal_error( - "pattern '" + pattern.getDebugName() + - "' produced IR that could not be legalized. " + "new ops: {" + - llvm::join(newOpNames, ", ") + "}, " + "modified ops: {" + - llvm::join(modifiedOpNames, ", ") + "}, " + "inserted block into ops: {" + - llvm::join(insertedBlockNames, ", ") + "}"); + llvm::report_fatal_error("pattern '" + pattern.getDebugName() + + "' produced IR that could not be legalized. " + + "new ops: {" + llvm::join(newOpNames, ", ") + "}, " + + "modified ops: {" + + llvm::join(modifiedOpNames, ", ") + "}"); } LogicalResult OperationLegalizer::legalizeWithPattern(Operation *op) { @@ -2743,7 +2775,6 @@ LogicalResult OperationLegalizer::legalizeWithPattern(Operation *op) { } rewriterImpl.patternNewOps.clear(); rewriterImpl.patternModifiedOps.clear(); - rewriterImpl.patternInsertedBlocks.clear(); LLVM_DEBUG({ logFailure(rewriterImpl.logger, "pattern failed to match"); if (rewriterImpl.config.notifyCallback) { @@ -2777,15 +2808,12 @@ LogicalResult OperationLegalizer::legalizeWithPattern(Operation *op) { SetVector<Operation *> newOps = moveAndReset(rewriterImpl.patternNewOps); SetVector<Operation *> modifiedOps = moveAndReset(rewriterImpl.patternModifiedOps); - SetVector<Block *> insertedBlocks = - moveAndReset(rewriterImpl.patternInsertedBlocks); - auto result = legalizePatternResult(op, pattern, curState, newOps, - modifiedOps, insertedBlocks); + auto result = + legalizePatternResult(op, pattern, curState, newOps, modifiedOps); appliedPatterns.erase(&pattern); if (failed(result)) { if (!rewriterImpl.config.allowPatternRollback) - reportNewIrLegalizationFatalError(pattern, newOps, modifiedOps, - insertedBlocks); + reportNewIrLegalizationFatalError(pattern, newOps, modifiedOps); rewriterImpl.resetState(curState, pattern.getDebugName()); } if (config.listener) @@ -2823,8 +2851,7 @@ bool OperationLegalizer::canApplyPattern(Operation *op, LogicalResult OperationLegalizer::legalizePatternResult( Operation *op, const Pattern &pattern, const RewriterState &curState, const SetVector<Operation *> &newOps, - const SetVector<Operation *> &modifiedOps, - const SetVector<Block *> &insertedBlocks) { + const SetVector<Operation *> &modifiedOps) { [[maybe_unused]] auto &impl = rewriter.getImpl(); assert(impl.pendingRootUpdates.empty() && "dangling root updates"); @@ -2843,8 +2870,7 @@ LogicalResult OperationLegalizer::legalizePatternResult( #endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS // Legalize each of the actions registered during application. 
- if (failed(legalizePatternBlockRewrites(op, insertedBlocks, newOps)) || - failed(legalizePatternRootUpdates(modifiedOps)) || + if (failed(legalizePatternRootUpdates(modifiedOps)) || failed(legalizePatternCreatedOperations(newOps))) { return failure(); } @@ -2853,53 +2879,6 @@ LogicalResult OperationLegalizer::legalizePatternResult( return success(); } -LogicalResult OperationLegalizer::legalizePatternBlockRewrites( - Operation *op, const SetVector<Block *> &insertedBlocks, - const SetVector<Operation *> &newOps) { - ConversionPatternRewriterImpl &impl = rewriter.getImpl(); - SmallPtrSet<Operation *, 16> alreadyLegalized; - - // If the pattern moved or created any blocks, make sure the types of block - // arguments get legalized. - for (Block *block : insertedBlocks) { - if (impl.erasedBlocks.contains(block)) - continue; - - // Only check blocks outside of the current operation. - Operation *parentOp = block->getParentOp(); - if (!parentOp || parentOp == op || block->getNumArguments() == 0) - continue; - - // If the region of the block has a type converter, try to convert the block - // directly. - if (auto *converter = impl.regionToConverter.lookup(block->getParent())) { - std::optional<TypeConverter::SignatureConversion> conversion = - converter->convertBlockSignature(block); - if (!conversion) { - LLVM_DEBUG(logFailure(impl.logger, "failed to convert types of moved " - "block")); - return failure(); - } - impl.applySignatureConversion(block, converter, *conversion); - continue; - } - - // Otherwise, try to legalize the parent operation if it was not generated - // by this pattern. This is because we will attempt to legalize the parent - // operation, and blocks in regions created by this pattern will already be - // legalized later on. - if (!newOps.count(parentOp) && alreadyLegalized.insert(parentOp).second) { - if (failed(legalize(parentOp))) { - LLVM_DEBUG(logFailure( - impl.logger, "operation '{0}'({1}) became illegal after rewrite", - parentOp->getName(), parentOp)); - return failure(); - } - } - } - return success(); -} - LogicalResult OperationLegalizer::legalizePatternCreatedOperations( const SetVector<Operation *> &newOps) { for (Operation *op : newOps) { @@ -3265,22 +3244,6 @@ static void reconcileUnrealizedCasts( // OperationConverter //===----------------------------------------------------------------------===// -namespace { -enum OpConversionMode { - /// In this mode, the conversion will ignore failed conversions to allow - /// illegal operations to co-exist in the IR. - Partial, - - /// In this mode, all operations must be legal for the given target for the - /// conversion to succeed. - Full, - - /// In this mode, operations are analyzed for legality. No actual rewrites are - /// applied to the operations on success. - Analysis, -}; -} // namespace - namespace mlir { // This class converts operations to a given conversion target via a set of // rewrite patterns. The conversion behaves differently depending on the @@ -3290,16 +3253,20 @@ struct OperationConverter { const FrozenRewritePatternSet &patterns, const ConversionConfig &config, OpConversionMode mode) - : rewriter(ctx, config), opLegalizer(rewriter, target, patterns), + : rewriter(ctx, config, *this), opLegalizer(rewriter, target, patterns), mode(mode) {} /// Converts the given operations to the conversion target. LogicalResult convertOperations(ArrayRef<Operation *> ops); -private: - /// Converts an operation with the given rewriter. - LogicalResult convert(Operation *op); + /// Converts a single operation. 
If `isRecursiveLegalization` is "true", the + /// conversion is a recursive legalization request, triggered from within a + /// pattern. In that case, do not emit errors because there will be another + /// attempt at legalizing the operation later (via the regular pre-order + /// legalization mechanism). + LogicalResult convert(Operation *op, bool isRecursiveLegalization = false); +private: /// The rewriter to use when converting operations. ConversionPatternRewriter rewriter; @@ -3311,32 +3278,42 @@ struct OperationConverter { }; } // namespace mlir -LogicalResult OperationConverter::convert(Operation *op) { +LogicalResult ConversionPatternRewriter::legalize(Operation *op) { + return impl->opConverter.convert(op, /*isRecursiveLegalization=*/true); +} + +LogicalResult OperationConverter::convert(Operation *op, + bool isRecursiveLegalization) { const ConversionConfig &config = rewriter.getConfig(); // Legalize the given operation. if (failed(opLegalizer.legalize(op))) { // Handle the case of a failed conversion for each of the different modes. // Full conversions expect all operations to be converted. - if (mode == OpConversionMode::Full) - return op->emitError() - << "failed to legalize operation '" << op->getName() << "'"; + if (mode == OpConversionMode::Full) { + if (!isRecursiveLegalization) + op->emitError() << "failed to legalize operation '" << op->getName() + << "'"; + return failure(); + } // Partial conversions allow conversions to fail iff the operation was not // explicitly marked as illegal. If the user provided a `unlegalizedOps` // set, non-legalizable ops are added to that set. if (mode == OpConversionMode::Partial) { - if (opLegalizer.isIllegal(op)) - return op->emitError() - << "failed to legalize operation '" << op->getName() - << "' that was explicitly marked illegal"; - if (config.unlegalizedOps) + if (opLegalizer.isIllegal(op)) { + if (!isRecursiveLegalization) + op->emitError() << "failed to legalize operation '" << op->getName() + << "' that was explicitly marked illegal"; + return failure(); + } + if (config.unlegalizedOps && !isRecursiveLegalization) config.unlegalizedOps->insert(op); } } else if (mode == OpConversionMode::Analysis) { // Analysis conversions don't fail if any operations fail to legalize, // they are only interested in the operations that were successfully // legalized. - if (config.legalizableOps) + if (config.legalizableOps && !isRecursiveLegalization) config.legalizableOps->insert(op); } return success(); @@ -3800,10 +3777,11 @@ static LogicalResult convertFuncOpTypes(FunctionOpInterface funcOp, TypeConverter::SignatureConversion result(type.getNumInputs()); SmallVector<Type, 1> newResults; if (failed(typeConverter.convertSignatureArgs(type.getInputs(), result)) || - failed(typeConverter.convertTypes(type.getResults(), newResults)) || - failed(rewriter.convertRegionTypes(&funcOp.getFunctionBody(), - typeConverter, &result))) + failed(typeConverter.convertTypes(type.getResults(), newResults))) return failure(); + if (!funcOp.getFunctionBody().empty()) + rewriter.applySignatureConversion(&funcOp.getFunctionBody().front(), result, + &typeConverter); // Update the function signature in-place. 
auto newType = FunctionType::get(rewriter.getContext(), diff --git a/mlir/python/mlir/dialects/transform/structured.py b/mlir/python/mlir/dialects/transform/structured.py index 14c7380e432f0..d9ab504f0de54 100644 --- a/mlir/python/mlir/dialects/transform/structured.py +++ b/mlir/python/mlir/dialects/transform/structured.py @@ -713,6 +713,8 @@ def __init__( disable_transfer_permutation_map_lowering_patterns: bool = False, vectorize_nd_extract: bool = False, vectorize_padding: bool = False, + flatten_1d_depthwise_conv: bool = False, + fold_type_extensions_into_contract: bool = False, loc=None, ip=None, ): @@ -722,8 +724,10 @@ def __init__( target, disable_multi_reduction_to_contract_patterns=disable_multi_reduction_to_contract_patterns, disable_transfer_permutation_map_lowering_patterns=disable_transfer_permutation_map_lowering_patterns, + flatten_1d_depthwise_conv=flatten_1d_depthwise_conv, vectorize_nd_extract=vectorize_nd_extract, vectorize_padding=vectorize_padding, + fold_type_extensions_into_contract=fold_type_extensions_into_contract, loc=loc, ip=ip, ) diff --git a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir index ba12ff29ebef9..b5dcb01d3dc6b 100644 --- a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir +++ b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir @@ -747,3 +747,29 @@ func.func @memref_bitcast(%1: memref<?xi16>) -> memref<?xbf16> { %2 = arith.bitcast %1 : memref<?xi16> to memref<?xbf16> func.return %2 : memref<?xbf16> } + +// ----- + +// CHECK-LABEL: func @unsupported_fp_type +// CHECK: arith.addf {{.*}} : f4E2M1FN +// CHECK: arith.addf {{.*}} : vector<4xf4E2M1FN> +// CHECK: arith.addf {{.*}} : vector<8x4xf4E2M1FN> +func.func @unsupported_fp_type(%arg0: f4E2M1FN, %arg1: vector<4xf4E2M1FN>, %arg2: vector<8x4xf4E2M1FN>) -> (f4E2M1FN, vector<4xf4E2M1FN>, vector<8x4xf4E2M1FN>) { + %0 = arith.addf %arg0, %arg0 : f4E2M1FN + %1 = arith.addf %arg1, %arg1 : vector<4xf4E2M1FN> + %2 = arith.addf %arg2, %arg2 : vector<8x4xf4E2M1FN> + return %0, %1, %2 : f4E2M1FN, vector<4xf4E2M1FN>, vector<8x4xf4E2M1FN> +} + +// ----- + +// CHECK-LABEL: func @supported_fp_type +// CHECK: llvm.fadd {{.*}} : f32 +// CHECK: llvm.fadd {{.*}} : vector<4xf32> +// CHECK-COUNT-4: llvm.fadd {{.*}} : vector<8xf32> +func.func @supported_fp_type(%arg0: f32, %arg1: vector<4xf32>, %arg2: vector<4x8xf32>) -> (f32, vector<4xf32>, vector<4x8xf32>) { + %0 = arith.addf %arg0, %arg0 : f32 + %1 = arith.addf %arg1, %arg1 : vector<4xf32> + %2 = arith.addf %arg2, %arg2 : vector<4x8xf32> + return %0, %1, %2 : f32, vector<4xf32>, vector<4x8xf32> +} diff --git a/mlir/test/Conversion/ConvertToSPIRV/vector.mlir b/mlir/test/Conversion/ConvertToSPIRV/vector.mlir index a75f30d57fa74..cd8cfc8736915 100644 --- a/mlir/test/Conversion/ConvertToSPIRV/vector.mlir +++ b/mlir/test/Conversion/ConvertToSPIRV/vector.mlir @@ -275,6 +275,42 @@ func.func @reduction_minimumf(%v : vector<3xf32>, %s: f32) -> f32 { // ----- +// CHECK-LABEL: spirv.func @reduction_minnumf( +// CHECK-SAME: %[[V:.*]]: vector<3xf32>, +// CHECK-SAME: %[[S:.*]]: f32) -> f32 "None" { +// CHECK: %[[S0:.*]] = spirv.CompositeExtract %[[V]][0 : i32] : vector<3xf32> +// CHECK: %[[S1:.*]] = spirv.CompositeExtract %[[V]][1 : i32] : vector<3xf32> +// CHECK: %[[S2:.*]] = spirv.CompositeExtract %[[V]][2 : i32] : vector<3xf32> +// CHECK: %[[MIN0:.*]] = spirv.GL.FMin %[[S0]], %[[S1]] : f32 +// CHECK: %[[MIN1:.*]] = spirv.GL.FMin %[[MIN0]], %[[S2]] : f32 +// CHECK: %[[MIN2:.*]] = spirv.GL.FMin %[[MIN1]], %[[S]] : f32 +// 
CHECK: spirv.ReturnValue %[[MIN2]] : f32 +// CHECK: } +func.func @reduction_minnumf(%v : vector<3xf32>, %s: f32) -> f32 { + %reduce = vector.reduction <minnumf>, %v, %s : vector<3xf32> into f32 + return %reduce : f32 +} + +// ----- + +// CHECK-LABEL: spirv.func @reduction_maxnumf( +// CHECK-SAME: %[[V:.*]]: vector<3xf32>, +// CHECK-SAME: %[[S:.*]]: f32) -> f32 "None" { +// CHECK: %[[S0:.*]] = spirv.CompositeExtract %[[V]][0 : i32] : vector<3xf32> +// CHECK: %[[S1:.*]] = spirv.CompositeExtract %[[V]][1 : i32] : vector<3xf32> +// CHECK: %[[S2:.*]] = spirv.CompositeExtract %[[V]][2 : i32] : vector<3xf32> +// CHECK: %[[MAX0:.*]] = spirv.GL.FMax %[[S0]], %[[S1]] : f32 +// CHECK: %[[MAX1:.*]] = spirv.GL.FMax %[[MAX0]], %[[S2]] : f32 +// CHECK: %[[MAX2:.*]] = spirv.GL.FMax %[[MAX1]], %[[S]] : f32 +// CHECK: spirv.ReturnValue %[[MAX2]] : f32 +// CHECK: } +func.func @reduction_maxnumf(%v : vector<3xf32>, %s: f32) -> f32 { + %reduce = vector.reduction <maxnumf>, %v, %s : vector<3xf32> into f32 + return %reduce : f32 +} + +// ----- + // CHECK-LABEL: func @reduction_maxsi // CHECK-SAME: (%[[V:.+]]: vector<3xi32>, %[[S:.+]]: i32) // CHECK: %[[S0:.+]] = spirv.CompositeExtract %[[V]][0 : i32] : vector<3xi32> diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index 8cce6308018e2..dcf4ddb2dd48c 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -491,12 +491,12 @@ func.func @mbarrier() { // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.shared %[[barPtr2]] + // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive %[[barPtr2]] %token = nvgpu.mbarrier.arrive %barrier[%c0] : !barrierType -> !tokenType // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]] + // CHECK: nvvm.mbarrier.test.wait %[[barPtr3]], %[[token]] %isDone = nvgpu.mbarrier.test.wait %barrier[%c0], %token : !barrierType, !tokenType func.return @@ -521,12 +521,12 @@ func.func @mbarrier_nocomplete() { // CHECK: %[[base2:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr2:.+]] = llvm.getelementptr %[[base2]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.nocomplete.shared %[[barPtr2]] + // CHECK: %[[token:.+]] = nvvm.mbarrier.arrive.nocomplete %[[barPtr2]] %token = nvgpu.mbarrier.arrive.nocomplete %barrier[%c0], %count : !barrierType -> !tokenType // CHECK: %[[base3:.+]] = llvm.extractvalue %[[barStr]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[barPtr3:.+]] = llvm.getelementptr %[[base3]][%[[mid]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 - // CHECK: nvvm.mbarrier.test.wait.shared %[[barPtr3]], %[[token]] + // CHECK: nvvm.mbarrier.test.wait %[[barPtr3]], %[[token]] %isDone = nvgpu.mbarrier.test.wait %barrier[%c0], %token : !barrierType, !tokenType func.return @@ -572,7 +572,7 @@ func.func @mbarrier_wait(%barriers : !nvgpu.mbarrier.group<memorySpace = #gpu.ad 
// CHECK: %[[S3:.+]] = builtin.unrealized_conversion_cast %[[S2]] : index to i64 // CHECK: %[[S4:.+]] = llvm.extractvalue %[[CARG0]][1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // CHECK: %[[S5:.+]] = llvm.getelementptr %[[S4]][%[[S3]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i64 -// CHECK: nvvm.mbarrier.test.wait.shared {{.*}}, %[[CARG1]] +// CHECK: nvvm.mbarrier.test.wait {{.*}}, %[[CARG1]] %mbarId = arith.remui %i, %numBarriers : index %isDone = nvgpu.mbarrier.test.wait %barriers[%mbarId], %token : !nvgpu.mbarrier.group<memorySpace = #gpu.address_space<workgroup>, num_barriers = 5>, !tokenType } diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index fbc4c0af60360..a9356c5cb60bb 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -88,10 +88,10 @@ func.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.p nvvm.cp.async.mbarrier.arrive %bar_gen : !llvm.ptr // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} {noinc = true} nvvm.cp.async.mbarrier.arrive %bar_gen {noinc = true} : !llvm.ptr - // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}} - nvvm.cp.async.mbarrier.arrive.shared %bar_shared : !llvm.ptr<3> - // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}} {noinc = true} - nvvm.cp.async.mbarrier.arrive.shared %bar_shared {noinc = true} : !llvm.ptr<3> + // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} + nvvm.cp.async.mbarrier.arrive %bar_shared : !llvm.ptr<3> + // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} {noinc = true} + nvvm.cp.async.mbarrier.arrive %bar_shared {noinc = true} : !llvm.ptr<3> llvm.return } diff --git a/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir index 9908205f07c92..ae5141db16c09 100644 --- a/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/load-to-xegpu.mlir @@ -9,11 +9,12 @@ func.func @load_1D_vector(%source: memref<8x16x32xf32>, %offset: index) -> vecto // CHECK-LABEL: @load_1D_vector( // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8xf32, +// CHECK-SAME: %[[COLLAPSED]] +// CHECK-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, // CHECK-SAME: boundary_check = false -// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8xf32> +// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]]]{{.*}}-> vector<8xf32> // CHECK: return %[[VEC]] // ----- @@ -28,35 +29,29 @@ func.func @load_2D_vector(%source: memref<8x16x32xf32>, // CHECK-LABEL: @load_2D_vector( // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// CHECK-SAME: %[[COLLAPSED]] +// CHECK-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], 
%[[OFFSET]]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] // ----- func.func @load_dynamic_source(%source: memref<?x?x?xf32>, - %offset: index) -> vector<8x16xf32> { - %0 = vector.load %source[%offset, %offset, %offset] + %i: index, %j: index, %k: index) -> vector<8x16xf32> { + %0 = vector.load %source[%i, %j, %k] : memref<?x?x?xf32>, vector<8x16xf32> return %0 : vector<8x16xf32> } // CHECK-LABEL: @load_dynamic_source( // CHECK-SAME: %[[SRC:.+]]: memref<?x?x?xf32>, -// CHECK-SAME: %[[OFFSET:.+]]: index -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[DIM_0:.+]] = memref.dim %[[SRC]], %[[C0]] -// CHECK-DAG: %[[DIM_1:.+]] = memref.dim %[[SRC]], %[[C1]] -// CHECK-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] -// CHECK: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] -// CHECK-SAME: memref<?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// CHECK-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// CHECK: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF1]], %[[OFF2]]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] // ----- @@ -72,9 +67,9 @@ func.func @load_out_of_bounds(%source: memref<7x15xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<7x15xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]]] +// CHECK-SAME: %[[SRC]] // CHECK-SAME: memref<7x15xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// CHECK: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> // CHECK: return %[[VEC]] // ----- diff --git a/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir index 2c498dcc2a071..1a10d917623cc 100644 --- a/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/store-to-xegpu.mlir @@ -11,11 +11,12 @@ func.func @store_1D_vector(%vec: vector<8xf32>, // CHECK-SAME: %[[VEC:.+]]: vector<8xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8xf32, +// CHECK-SAME: %[[COLLAPSED]] +// CHECK-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, // CHECK-SAME: boundary_check = false -// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8xf32> +// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]]] : vector<8xf32> // ----- @@ -30,16 +31,17 @@ func.func @store_2D_vector(%vec: vector<8x16xf32>, // CHECK-SAME: %[[VEC:.+]]: vector<8x16xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = 
memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// CHECK-SAME: %[[COLLAPSED]] +// CHECK-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32> +// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> // ----- func.func @store_dynamic_source(%vec: vector<8x16xf32>, - %source: memref<?x?x?xf32>, %offset: index) { - vector.store %vec, %source[%offset, %offset, %offset] + %source: memref<?x?x?xf32>, %i: index, %j: index, %k: index) { + vector.store %vec, %source[%i, %j, %k] : memref<?x?x?xf32>, vector<8x16xf32> return } @@ -47,18 +49,11 @@ func.func @store_dynamic_source(%vec: vector<8x16xf32>, // CHECK-LABEL: @store_dynamic_source( // CHECK-SAME: %[[VEC:.+]]: vector<8x16xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<?x?x?xf32>, -// CHECK-SAME: %[[OFFSET:.+]]: index -// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index -// CHECK-DAG: %[[DIM_0:.+]] = memref.dim %[[SRC]], %[[C0]] -// CHECK-DAG: %[[DIM_1:.+]] = memref.dim %[[SRC]], %[[C1]] -// CHECK-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] -// CHECK: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] -// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// CHECK-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] -// CHECK-SAME: memref<?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// CHECK-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// CHECK: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// CHECK: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF1]], %[[OFF2]]] : vector<8x16xf32> // ----- @@ -74,9 +69,9 @@ func.func @store_out_of_bounds(%vec: vector<8x16xf32>, // CHECK-SAME: %[[SRC:.+]]: memref<7x64xf32>, // CHECK-SAME: %[[OFFSET:.+]]: index // CHECK: %[[DESC:.+]] = xegpu.create_nd_tdesc -// CHECK-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]]] +// CHECK-SAME: %[[SRC]] // CHECK-SAME: memref<7x64xf32> -> !xegpu.tensor_desc<8x16xf32> -// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// CHECK: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> // ----- diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir index c4ca79af1bd9a..c87a5304babfe 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-read-to-xegpu.mlir @@ -12,11 +12,12 @@ gpu.func @load_1D_vector(%source: memref<8x16x32xf32>, %offset: index) -> vector // LOAD-ND-LABEL: @load_1D_vector( // LOAD-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // LOAD-ND-SAME: %[[OFFSET:.+]]: index +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// LOAD-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// LOAD-ND-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8xf32, +// 
LOAD-ND-SAME: %[[COLLAPSED]] +// LOAD-ND-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, // LOAD-ND-SAME: boundary_check = false -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8xf32> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]]]{{.*}}-> vector<8xf32> // LOAD-ND: return %[[VEC]] // LOAD-GATHER-LABEL: @load_1D_vector( @@ -46,11 +47,12 @@ gpu.func @load_2D_vector(%source: memref<8x16x32xf32>, // LOAD-ND-LABEL: @load_2D_vector( // LOAD-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // LOAD-ND-SAME: %[[OFFSET:.+]]: index +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// LOAD-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// LOAD-ND-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8x16xf32, +// LOAD-ND-SAME: %[[COLLAPSED]] +// LOAD-ND-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32, // LOAD-ND-SAME: boundary_check = false -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> // LOAD-ND: return %[[VEC]] // LOAD-GATHER-LABEL: @load_2D_vector( @@ -83,9 +85,9 @@ gpu.func @load_zero_pad_out_of_bounds(%source: memref<32x64xf32>, // LOAD-ND-LABEL: @load_zero_pad_out_of_bounds( // LOAD-ND-SAME: %[[SRC:.+]]: memref<32x64xf32>, // LOAD-ND-SAME: %[[OFFSET:.+]]: index -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]]] +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]] // LOAD-ND-SAME: memref<32x64xf32> -> !xegpu.tensor_desc<8x16xf32> -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET]], %[[OFFSET]]]{{.*}}-> vector<8x16xf32> // LOAD-ND: return %[[VEC]] // LOAD-GATHER-LABEL: @load_zero_pad_out_of_bounds( @@ -109,9 +111,9 @@ gpu.func @load_transposed(%source: memref<32x64xf32>, // LOAD-ND-SAME: %[[SRC:.+]]: memref<32x64xf32>, // LOAD-ND-SAME: %[[OFFSET1:.+]]: index, // LOAD-ND-SAME: %[[OFFSET2:.+]]: index -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET1]], %[[OFFSET2]]] +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]] // LOAD-ND-SAME: memref<32x64xf32> -> !xegpu.tensor_desc<16x8xf32 -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]] <{transpose = array<i64: 1, 0>}> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFFSET1]], %[[OFFSET2]]] <{transpose = array<i64: 1, 0>}> // LOAD-ND-SAME: -> vector<8x16xf32> // LOAD-ND: return %[[VEC]] @@ -143,16 +145,11 @@ gpu.func @load_dynamic_source(%source: memref<?x?x?xf32>, } // LOAD-ND-LABEL: @load_dynamic_source( // LOAD-ND-SAME: %[[SRC:.+]]: memref<?x?x?xf32>, -// LOAD-ND-SAME: %[[OFFSET:.+]]: index -// LOAD-ND: %[[C2:.+]] = arith.constant 2 : index -// LOAD-ND: %[[C1:.+]] = arith.constant 1 : index -// LOAD-ND: %[[C0:.+]] = arith.constant 0 : index -// LOAD-ND-DAG: %[[DIM_0:.+]] = memref.dim %[[SRC]], %[[C0]] -// LOAD-ND-DAG: %[[DIM_1:.+]] = memref.dim %[[SRC]], %[[C1]] -// LOAD-ND-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] -// LOAD-ND: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET:.+]], %[[OFFSET:.+]], %[[OFFSET:.+]]] -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8x16xf32> +// LOAD-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview 
%[[SRC]][%[[OFF0]], 0, 0] +// LOAD-ND: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF1]], %[[OFF2]]]{{.*}}-> vector<8x16xf32> // LOAD-ND: return %[[VEC]] @@ -184,10 +181,11 @@ gpu.func @load_dynamic_source2(%source: memref<?x8x16xf32>, } // LOAD-ND-LABEL: @load_dynamic_source2( -// LOAD-ND-DAG: %[[C0:.+]] = arith.constant 0 : index -// LOAD-ND-DAG: %[[DIM:.+]] = memref.dim %{{.*}}, %[[C0]] : memref<?x8x16xf32> -// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], shape : [%[[DIM]], 8, 16], strides : [128, 16, 1] : memref<?x8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]] : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> -> vector<8x16xf32> +// LOAD-ND-SAME: %[[SRC:.+]]: memref<?x8x16xf32>, +// LOAD-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %{{.*}} : memref<8x16xf32, strided<[16, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%{{.*}}, %{{.*}}] : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr<boundary_check = false>> -> vector<8x16xf32> // LOAD-ND: return %[[VEC]] : vector<8x16xf32> // LOAD-GATHER-LABEL: @load_dynamic_source2( @@ -418,11 +416,12 @@ gpu.func @load_from_subview(%source: memref<4096x4096xf16>, %off1: index, %off2: // LOAD-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, // LOAD-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index // LOAD-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> +// LOAD-ND: %[[COLLAPSED:.+]] = memref.subview %[[SUBVIEW]][%[[OFF2]], 0] // LOAD-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// LOAD-ND-SAME: %[[SUBVIEW]][%[[OFF2]], %[[OFF2]]] -// LOAD-ND-SAME: memref<256x256xf16, strided<[4096, 1], offset: ?>> -> !xegpu.tensor_desc<8xf16, +// LOAD-ND-SAME: %[[COLLAPSED]] +// LOAD-ND-SAME: memref<256xf16, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf16, // LOAD-ND-SAME: boundary_check = false -// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]]{{.*}}-> vector<8xf16> +// LOAD-ND: %[[VEC:.+]] = xegpu.load_nd %[[DESC]][%[[OFF2]]]{{.*}}-> vector<8xf16> // LOAD-ND: return %[[VEC]] // LOAD-GATHER-LABEL: @load_from_subview( diff --git a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir index fcfc9414da4f6..43a1a7206e2cc 100644 --- a/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir +++ b/mlir/test/Conversion/VectorToXeGPU/transfer-write-to-xegpu.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s --xevm-attach-target='module=xevm.* O=3 chip=pvc' -convert-vector-to-xegpu -split-input-file | FileCheck %s --check-prefix=STORE-ND +// RUN: mlir-opt %s --xevm-attach-target='module=xevm_* O=3 chip=pvc' -convert-vector-to-xegpu -split-input-file | FileCheck %s --check-prefix=STORE-ND // RUN: mlir-opt %s -convert-vector-to-xegpu -split-input-file | FileCheck %s --check-prefix=STORE-SCATTER @@ -15,11 +15,12 @@ gpu.func @store_1D_vector(%vec: vector<8xf32>, // STORE-ND-SAME: %[[VEC:.+]]: vector<8xf32>, // 
STORE-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // STORE-ND-SAME: %[[OFFSET:.+]]: index +// STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], %[[OFFSET]], 0] // STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// STORE-ND-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8xf32, +// STORE-ND-SAME: %[[COLLAPSED]] +// STORE-ND-SAME: memref<32xf32, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf32, // STORE-ND-SAME: boundary_check = false -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8xf32> +// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]]] : vector<8xf32> // STORE-SCATTER-LABEL: @store_1D_vector( // STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8xf32>, @@ -49,11 +50,12 @@ gpu.func @store_2D_vector(%vec: vector<8x16xf32>, // STORE-ND-SAME: %[[VEC:.+]]: vector<8x16xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<8x16x32xf32>, // STORE-ND-SAME: %[[OFFSET:.+]]: index +// STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFFSET]], 0, 0] // STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// STORE-ND-SAME: memref<8x16x32xf32> -> !xegpu.tensor_desc<8x16xf32, +// STORE-ND-SAME: %[[COLLAPSED]] +// STORE-ND-SAME: memref<16x32xf32, strided<[32, 1], offset: ?>> -> !xegpu.tensor_desc<8x16xf32, // STORE-ND-SAME: boundary_check = false -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> // STORE-SCATTER-LABEL: @store_2D_vector( // STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8x16xf32>, @@ -73,8 +75,8 @@ gpu.func @store_2D_vector(%vec: vector<8x16xf32>, // ----- gpu.module @xevm_module { gpu.func @store_dynamic_source(%vec: vector<8x16xf32>, - %source: memref<?x?x?xf32>, %offset: index) { - vector.transfer_write %vec, %source[%offset, %offset, %offset] + %source: memref<?x?x?xf32>, %i: index, %j: index, %k: index) { + vector.transfer_write %vec, %source[%i, %j, %k] {in_bounds = [true, true]} : vector<8x16xf32>, memref<?x?x?xf32> gpu.return @@ -83,18 +85,11 @@ gpu.func @store_dynamic_source(%vec: vector<8x16xf32>, // STORE-ND-LABEL: @store_dynamic_source( // STORE-ND-SAME: %[[VEC:.+]]: vector<8x16xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<?x?x?xf32>, -// STORE-ND-SAME: %[[OFFSET:.+]]: index -// STORE-ND-DAG: %[[C0:.+]] = arith.constant 0 : index -// STORE-ND-DAG: %[[C1:.+]] = arith.constant 1 : index -// STORE-ND-DAG: %[[C2:.+]] = arith.constant 2 : index -// STORE-ND-DAG: %[[DIM_0:.+]] = memref.dim %[[SRC]], %[[C0]] -// STORE-ND-DAG: %[[DIM_1:.+]] = memref.dim %[[SRC]], %[[C1]] -// STORE-ND-DAG: %[[DIM_2:.+]] = memref.dim %[[SRC]], %[[C2]] -// STORE-ND: %[[DIM_0_STRIDE:.+]] = arith.muli %[[DIM_2]], %[[DIM_1]] -// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[SRC]][%[[OFFSET]], %[[OFFSET]], %[[OFFSET]]] -// STORE-ND-SAME: , shape : [%[[DIM_0]], %[[DIM_1]], %[[DIM_2]]], strides : [%[[DIM_0_STRIDE]], %[[DIM_2]], 1] -// STORE-ND-SAME: memref<?x?x?xf32> -> !xegpu.tensor_desc<8x16xf32 -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// STORE-ND-SAME: %[[OFF0:.+]]: index, %[[OFF1:.+]]: index, %[[OFF2:.+]]: index +// STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SRC]][%[[OFF0]], 0, 0] +// STORE-ND: {{.*}} %[[SIZES:.+]]:2, %[[STRIDES:.+]]:2 = memref.extract_strided_metadata %[[COLLAPSED]] +// STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc %[[COLLAPSED]] +// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF1]], 
%[[OFF2]]] : vector<8x16xf32> // STORE-SCATTER-LABEL: @store_dynamic_source( // STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8x16xf32>, @@ -126,9 +121,9 @@ gpu.func @store_out_of_bounds(%vec: vector<8x16xf32>, // STORE-ND-SAME: %[[SRC:.+]]: memref<7x64xf32>, // STORE-ND-SAME: %[[OFFSET:.+]]: index // STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[SRC]][%[[OFFSET]], %[[OFFSET]]] +// STORE-ND-SAME: %[[SRC]] // STORE-ND-SAME: memref<7x64xf32> -> !xegpu.tensor_desc<8x16xf32> -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8x16xf32> +// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFFSET]], %[[OFFSET]]] : vector<8x16xf32> // STORE-SCATTER-LABEL: @store_out_of_bounds( // STORE-SCATTER: vector.transfer_write @@ -298,13 +293,13 @@ gpu.func @store_to_subview(%vec: vector<8xf16>, // STORE-ND-SAME: %[[VEC:.+]]: vector<8xf16>, // STORE-ND-SAME: %[[SRC:.+]]: memref<4096x4096xf16>, // STORE-ND-SAME: %[[OFF1:.+]]: index, %[[OFF2:.+]]: index -// STORE-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] -// STORE-ND-SAME: : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> +// STORE-ND: %[[SUBVIEW:.+]] = memref.subview %[[SRC]][%[[OFF1]], %[[OFF2]]] [256, 256] [1, 1] : memref<4096x4096xf16> to memref<256x256xf16, strided<[4096, 1], offset: ?>> +// STORE-ND: %[[COLLAPSED:.+]] = memref.subview %[[SUBVIEW]][%[[OFF2]], 0] // STORE-ND: %[[DESC:.+]] = xegpu.create_nd_tdesc -// STORE-ND-SAME: %[[SUBVIEW]][%[[OFF2]], %[[OFF2]]] -// STORE-ND-SAME: memref<256x256xf16, strided<[4096, 1], offset: ?>> -> !xegpu.tensor_desc<8xf16, +// STORE-ND-SAME: %[[COLLAPSED]] +// STORE-ND-SAME: memref<256xf16, strided<[1], offset: ?>> -> !xegpu.tensor_desc<8xf16, // STORE-ND-SAME: boundary_check = false -// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]] : vector<8xf16> +// STORE-ND: xegpu.store_nd %[[VEC]], %[[DESC]][%[[OFF2]]] : vector<8xf16> // STORE-SCATTER-LABEL: @store_to_subview( // STORE-SCATTER-SAME: %[[VEC:.+]]: vector<8xf16>, diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir index b616632a6fe24..b062736575ad7 100644 --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_reduction.mlir @@ -243,6 +243,106 @@ func.func @vecdim_reduction_ori(%in: memref<256x512xi32>, %out: memref<256xi32>) // CHECK: affine.store %[[final_red]], %{{.*}} : memref<256xi32> // CHECK: } +// ----- + +func.func @vecdim_reduction_xori(%in: memref<256x512xi32>, %out: memref<256xi32>) { + %cst = arith.constant 0 : i32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (i32) { + %ld = affine.load %in[%i, %j] : memref<256x512xi32> + %xor = arith.xori %red_iter, %ld : i32 + affine.yield %xor : i32 + } + affine.store %final_red, %out[%i] : memref<256xi32> + } + return +} + +// CHECK-LABEL: func.func @vecdim_reduction_xori( +// CHECK-SAME: %[[input:.*]]: memref<256x512xi32>, +// CHECK-SAME: %[[output:.*]]: memref<256xi32>) { +// CHECK: %[[cst:.*]] = arith.constant 0 : i32 +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = arith.constant dense<0> : vector<128xi32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xi32>) { +// CHECK: %[[poison:.*]] = ub.poison : i32 +// CHECK: %[[ld:.*]] = vector.transfer_read %[[input]]{{\[}}%{{.*}}, %{{.*}}], %[[poison]] : 
memref<256x512xi32>, vector<128xi32> +// CHECK: %[[xor:.*]] = arith.xori %[[red_iter]], %[[ld]] : vector<128xi32> +// CHECK: affine.yield %[[xor]] : vector<128xi32> +// CHECK: } +// CHECK: %[[final_red:.*]] = vector.reduction <xor>, %[[vred]] : vector<128xi32> into i32 +// CHECK: affine.store %[[final_red]], %[[output]]{{\[}}%{{.*}}] : memref<256xi32> +// CHECK: } +// CHECK: return +// CHECK: } + +// ----- + +func.func @vecdim_reduction_minnumf(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = arith.constant 0xFF800000 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %min = arith.minnumf %red_iter, %ld : f32 + affine.yield %min : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: func.func @vecdim_reduction_minnumf( +// CHECK-SAME: %[[input:.*]]: memref<256x512xf32>, +// CHECK-SAME: %[[output:.*]]: memref<256xf32>) { +// CHECK: %[[cst:.*]] = arith.constant 0xFF800000 : f32 +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = arith.constant dense<0x7FC00000> : vector<128xf32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) { +// CHECK: %[[poison:.*]] = ub.poison : f32 +// CHECK: %[[ld:.*]] = vector.transfer_read %[[input]]{{\[}}%{{.*}}, %{{.*}}], %[[poison]] : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[min:.*]] = arith.minnumf %[[red_iter]], %[[ld]] : vector<128xf32> +// CHECK: affine.yield %[[min]] : vector<128xf32> +// CHECK: } +// CHECK: %[[red_scalar:.*]] = vector.reduction <minnumf>, %[[vred]] : vector<128xf32> into f32 +// CHECK: %[[final_red:.*]] = arith.minnumf %[[red_scalar]], %[[cst]] : f32 +// CHECK: affine.store %[[final_red]], %[[output]]{{\[}}%{{.*}}] : memref<256xf32> +// CHECK: } +// CHECK: return +// CHECK: } + +// ----- + +func.func @vecdim_reduction_maxnumf(%in: memref<256x512xf32>, %out: memref<256xf32>) { + %cst = arith.constant 0xFF800000 : f32 + affine.for %i = 0 to 256 { + %final_red = affine.for %j = 0 to 512 iter_args(%red_iter = %cst) -> (f32) { + %ld = affine.load %in[%i, %j] : memref<256x512xf32> + %max = arith.maxnumf %red_iter, %ld : f32 + affine.yield %max : f32 + } + affine.store %final_red, %out[%i] : memref<256xf32> + } + return +} + +// CHECK-LABEL: func.func @vecdim_reduction_maxnumf( +// CHECK-SAME: %[[input:.*]]: memref<256x512xf32>, +// CHECK-SAME: %[[output:.*]]: memref<256xf32>) { +// CHECK: %[[cst:.*]] = arith.constant 0xFF800000 : f32 +// CHECK: affine.for %{{.*}} = 0 to 256 { +// CHECK: %[[vzero:.*]] = arith.constant dense<0xFFC00000> : vector<128xf32> +// CHECK: %[[vred:.*]] = affine.for %{{.*}} = 0 to 512 step 128 iter_args(%[[red_iter:.*]] = %[[vzero]]) -> (vector<128xf32>) { +// CHECK: %[[poison:.*]] = ub.poison : f32 +// CHECK: %[[ld:.*]] = vector.transfer_read %[[input]]{{\[}}%{{.*}}, %{{.*}}], %[[poison]] : memref<256x512xf32>, vector<128xf32> +// CHECK: %[[max:.*]] = arith.maxnumf %[[red_iter]], %[[ld]] : vector<128xf32> +// CHECK: affine.yield %[[max]] : vector<128xf32> +// CHECK: } +// CHECK: %[[red_scalar:.*]] = vector.reduction <maxnumf>, %[[vred]] : vector<128xf32> into f32 +// CHECK: %[[final_red:.*]] = arith.maxnumf %[[red_scalar]], %[[cst]] : f32 +// CHECK: affine.store %[[final_red]], %[[output]]{{\[}}%{{.*}}] : memref<256xf32> +// CHECK: } +// CHECK: return +// CHECK: } // ----- diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir 
b/mlir/test/Dialect/LLVMIR/invalid.mlir index aaf9f8024bfbe..49b6342aea538 100644 --- a/mlir/test/Dialect/LLVMIR/invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/invalid.mlir @@ -664,21 +664,21 @@ func.func @zero_non_llvm_type() { // ----- func.func @nvvm_invalid_shfl_pred_1(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) { - // expected-error@+1 {{expected return type to be a two-element struct with i1 as the second element}} + // expected-error@+1 {{expected return type to be a two-element struct}} %0 = nvvm.shfl.sync bfly %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> i32 } // ----- func.func @nvvm_invalid_shfl_pred_2(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) { - // expected-error@+1 {{expected return type to be a two-element struct with i1 as the second element}} + // expected-error@+1 {{expected return type to be a two-element struct}} %0 = nvvm.shfl.sync bfly %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32)> } // ----- func.func @nvvm_invalid_shfl_pred_3(%arg0 : i32, %arg1 : i32, %arg2 : i32, %arg3 : i32) { - // expected-error@+1 {{expected return type to be a two-element struct with i1 as the second element}} + // expected-error@+1 {{expected second element in the returned struct to be of type 'i1' but got 'i32' instead}} %0 = nvvm.shfl.sync bfly %arg0, %arg3, %arg1, %arg2 {return_value_and_is_valid} : i32 -> !llvm.struct<(i32, i32)> } diff --git a/mlir/test/Dialect/LLVMIR/nvvm.mlir b/mlir/test/Dialect/LLVMIR/nvvm.mlir index 2505e56407c2b..cd7bd37da5763 100644 --- a/mlir/test/Dialect/LLVMIR/nvvm.mlir +++ b/mlir/test/Dialect/LLVMIR/nvvm.mlir @@ -445,8 +445,8 @@ llvm.func private @mbarrier_arrive(%barrier: !llvm.ptr) { } llvm.func private @mbarrier_arrive_shared(%barrier: !llvm.ptr<3>) { - // CHECK: nvvm.mbarrier.arrive.shared %{{.*}} : !llvm.ptr<3> - %0 = nvvm.mbarrier.arrive.shared %barrier : !llvm.ptr<3> -> i64 + // CHECK: nvvm.mbarrier.arrive %{{.*}} : !llvm.ptr<3> + %0 = nvvm.mbarrier.arrive %barrier : !llvm.ptr<3> -> i64 llvm.return } @@ -459,8 +459,8 @@ llvm.func private @mbarrier_arrive_nocomplete(%barrier: !llvm.ptr) { llvm.func private @mbarrier_arrive_nocomplete_shared(%barrier: !llvm.ptr<3>) { %count = nvvm.read.ptx.sreg.ntid.x : i32 - // CHECK: nvvm.mbarrier.arrive.nocomplete.shared %{{.*}} : !llvm.ptr<3> - %0 = nvvm.mbarrier.arrive.nocomplete.shared %barrier, %count : !llvm.ptr<3>, i32 -> i64 + // CHECK: nvvm.mbarrier.arrive.nocomplete %{{.*}} : !llvm.ptr<3> + %0 = nvvm.mbarrier.arrive.nocomplete %barrier, %count : !llvm.ptr<3>, i32 -> i64 llvm.return } @@ -472,8 +472,8 @@ llvm.func private @mbarrier_test_wait(%barrier: !llvm.ptr, %token : i64) -> i1 { llvm.func private @mbarrier_test_wait_shared(%barrier: !llvm.ptr<3>, %token : i64) { %count = nvvm.read.ptx.sreg.ntid.x : i32 - // CHECK: nvvm.mbarrier.test.wait.shared %{{.*}} - %isComplete = nvvm.mbarrier.test.wait.shared %barrier, %token : !llvm.ptr<3>, i64 -> i1 + // CHECK: nvvm.mbarrier.test.wait %{{.*}} + %isComplete = nvvm.mbarrier.test.wait %barrier, %token : !llvm.ptr<3>, i64 -> i1 llvm.return } diff --git a/mlir/test/Dialect/LLVMIR/nvvm/invalid-convert-stochastic-rounding.mlir b/mlir/test/Dialect/LLVMIR/nvvm/invalid-convert-stochastic-rounding.mlir new file mode 100644 index 0000000000000..35f5e1b3c8ba2 --- /dev/null +++ b/mlir/test/Dialect/LLVMIR/nvvm/invalid-convert-stochastic-rounding.mlir @@ -0,0 +1,90 @@ +// RUN: mlir-opt %s -split-input-file -verify-diagnostics + +// Test invalid target architecture (sm_100 instead of sm_100a) +gpu.module 
@invalid_arch_sm_100 [#nvvm.target<chip = "sm_100">] { + func.func @convert_rs() { + %f1 = llvm.mlir.constant(1.0 : f32) : f32 + %f2 = llvm.mlir.constant(2.0 : f32) : f32 + %rbits = llvm.mlir.constant(0x12345678 : i32) : i32 + // expected-error@+1 {{'nvvm.convert.f32x2.to.f16x2' op is not supported on sm_100}} + %res = nvvm.convert.f32x2.to.f16x2 %f1, %f2, %rbits : vector<2xf16> + return + } +} + +// ----- + +// Test that operations require stochastic rounding mode +llvm.func @invalid_rnd_mode_f16x2(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // expected-error@+1 {{Only RS rounding mode is supported for conversions from f32x2 to f16x2.}} + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits {rnd = #nvvm.fp_rnd_mode<rn>} : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// ----- + +llvm.func @invalid_rnd_mode_bf16x2(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // expected-error@+1 {{Only RS rounding mode is supported for conversions from f32x2 to bf16x2.}} + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits {rnd = #nvvm.fp_rnd_mode<rz>} : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// ----- + +// Test invalid destination types for f8x4 (should only accept f8E4M3FN, f8E5M2) +llvm.func @invalid_dst_type_f8x4_e3m4(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // expected-error@+1 {{Only 'f8E4M3FN' and 'f8E5M2' types are supported for conversions from f32x4 to f8x4.}} + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E3M4) + llvm.return %res : vector<4xi8> +} + +// ----- + +llvm.func @invalid_dst_type_f8x4_e8m0(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // expected-error@+1 {{Only 'f8E4M3FN' and 'f8E5M2' types are supported for conversions from f32x4 to f8x4.}} + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E8M0FNU) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test invalid destination types for f6x4 (should only accept f6E2M3FN, f6E3M2FN) +llvm.func @invalid_dst_type_f6x4_f8(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // expected-error@+1 {{Only 'f6E2M3FN' and 'f6E3M2FN' types are supported for conversions from f32x4 to f6x4.}} + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E4M3FN) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test invalid destination type for f4x4 (should only accept f4E2M1FN) +llvm.func @invalid_dst_type_f4x4_f6(%src : vector<4xf32>, %rbits : i32) -> i16 { + // expected-error@+1 {{Only 'f4E2M1FN' type is supported for conversions from f32x4 to f4x4.}} + %res = nvvm.convert.f32x4.to.f4x4 %src, %rbits : vector<4xf32> -> i16 (f6E2M3FN) + llvm.return %res : i16 +} + +// ----- + +// Test invalid rounding modes for non-stochastic ops +llvm.func @convert_float_to_tf32_rs_not_supported(%src : f32) -> i32 { + // expected-error @below {{Only {rn,rz,rna} rounding modes supported for ConvertFloatToTF32Op.}} + %res = nvvm.convert.float.to.tf32 %src {rnd = #nvvm.fp_rnd_mode<rs>} + llvm.return %res : i32 +} + +// ----- + +llvm.func @convert_f32x2_to_f8x2_rs_not_supported(%a : f32, %b : f32) { + // expected-error @below {{Only RN rounding mode is supported for conversions from f32x2 to 'f8E4M3FN' and 'f8E5M2' types}} + %res = nvvm.convert.f32x2.to.f8x2 %a, %b {rnd = #nvvm.fp_rnd_mode<rs>, sat = #nvvm.sat_mode<satfinite>} : i16 (f8E4M3FN) + llvm.return +} + +// ----- + +llvm.func @convert_bf16x2_to_f8x2_rs_not_supported(%src : vector<2xbf16>) { + // expected-error 
@below {{Only RZ and RP rounding modes are supported for conversions from bf16x2 to f8x2.}} + %res = nvvm.convert.bf16x2.to.f8x2 %src {rnd = #nvvm.fp_rnd_mode<rs>} : vector<2xbf16> -> i16 (f8E8M0FNU) + llvm.return +} diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index e703600c71c8e..5e857599b65ea 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -664,6 +664,19 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { llvm.return } +llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { + // CHECK-LABEL @rocdl.global.load.async.to.lds + // CHECK: rocdl.global.load.async.to.lds.b8 %{{.*}}, %{{.*}}, 0, 0 + // CHECK: rocdl.global.load.async.to.lds.b32 %{{.*}}, %{{.*}}, 0, 0 + // CHECK: rocdl.global.load.async.to.lds.b64 %{{.*}}, %{{.*}}, 0, 0 + // CHECK: rocdl.global.load.async.to.lds.b128 %{{.*}}, %{{.*}}, 0, 0 + rocdl.global.load.async.to.lds.b8 %src, %dst, 0, 0 : <1>, <3> + rocdl.global.load.async.to.lds.b32 %src, %dst, 0, 0 : <1>, <3> + rocdl.global.load.async.to.lds.b64 %src, %dst, 0, 0 : <1>, <3> + rocdl.global.load.async.to.lds.b128 %src, %dst, 0, 0 : <1>, <3> + llvm.return +} + // CHECK-LABEL @rocdl.tensor.load.to.lds llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>, %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) { diff --git a/mlir/test/Dialect/OpenACC/invalid.mlir b/mlir/test/Dialect/OpenACC/invalid.mlir index 26b63fbe182ea..0e75894eaeceb 100644 --- a/mlir/test/Dialect/OpenACC/invalid.mlir +++ b/mlir/test/Dialect/OpenACC/invalid.mlir @@ -492,6 +492,15 @@ func.func @fct1(%0 : !llvm.ptr) -> () { // ----- +%i1 = arith.constant 1 : i32 +%i2 = arith.constant 10 : i32 +// expected-error@+1 {{unstructured acc.loop must not have induction variables}} +acc.loop control(%iv : i32) = (%i1 : i32) to (%i2 : i32) step (%i1 : i32) { + acc.yield +} attributes {independent = [#acc.device_type<none>], unstructured} + +// ----- + // expected-error@+1 {{expect at least one of num, dim or static values}} acc.loop gang({}) { "test.openacc_dummy_op"() : () -> () diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir index 042ee2503cb95..df8ab9b7dd239 100644 --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -2143,6 +2143,20 @@ func.func @acc_loop_container() { // ----- +func.func @acc_unstructured_loop() { + acc.loop { + acc.yield + } attributes {independent = [#acc.device_type<none>], unstructured} + return +} + +// CHECK-LABEL: func.func @acc_unstructured_loop +// CHECK: acc.loop +// CHECK: acc.yield +// CHECK: } attributes {independent = [#acc.device_type<none>], unstructured} + +// ----- + // Test private recipe with data bounds for array slicing acc.private.recipe @privatization_memref_slice : memref<10x10xf32> init { ^bb0(%arg0: memref<10x10xf32>, %bounds0: !acc.data_bounds_ty, %bounds1: !acc.data_bounds_ty): diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index ebbe3ce0ec0d0..92f353717ac59 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -451,7 +451,7 @@ func.func @store_scatter_offset_wi_1(%src: memref<?xf16>) { %offsets = arith.constant dense<[0]> : vector<1xindex> %mask = arith.constant dense<1>: vector<1xi1> // expected-error@+1 {{Mask should match value except the chunk size dim}} - xegpu.store %val, %src[%offsets], %mask + xegpu.store %val, %src[%offsets], %mask : 
vector<4xf16>, memref<?xf16>, vector<1xindex>, vector<1xi1> return } @@ -870,14 +870,6 @@ func.func @load_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>) { return } -// ----- -func.func @load_mem_desc_invalid_attr2(%arg0: !xegpu.mem_desc<16x64xf16>) { - // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}} - %data2 = xegpu.load_matrix %arg0[8, 8] <{subgroup_block_io}>: !xegpu.mem_desc<16x64xf16> -> vector<16x16xf16> - return -} - - // ----- func.func @store_mem_desc_mismatch_element_type(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<16x16xf32>) { // expected-error@+1 {{failed to verify that all of {mem_desc, data} have same element type}} @@ -900,16 +892,25 @@ func.func @store_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>, %arg1: ve } // ----- -func.func @store_mem_desc_invalid_attr2(%arg0: !xegpu.mem_desc<16x64xf16>, %data: vector<16x16xf16>) { - // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}} - xegpu.store_matrix %data, %arg0[8, 8] <{subgroup_block_io}>: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16> +func.func @simt_store_matrix_vector_nonlinear(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1]>>, %arg1: vector<2x16xf32>) { + // expected-error@+1 {{With subgroup_block_io, accessed data must be contiguous and coalesced}} + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>} : + vector<2x16xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1]>> return } // ----- -func.func @store_mem_desc_invalid_attr2(%arg0: !xegpu.mem_desc<16x64xf16>, %data: vector<16x16xf16>) { - // expected-error@+1 {{subgroup_block_io are only allowed when result is a 1D VectorType.}} - xegpu.store_matrix %data, %arg0[8, 8] <{subgroup_block_io}>: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16> +func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [1, 16]>>, %arg1: vector<16x2xf32>) { + // expected-error@+1 {{With subgroup_block_io, the distributed dimensions must be contiguous}} + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]>} : + vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [1, 16]>> return } +// ----- +func.func @simt_store_matrix_vector_noncoalesced(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1], block = [1, 17]>>, %arg1: vector<16x2xf32>) { + // expected-error@+1 {{With subgroup_block_io, the block shape must match the lane layout}} + xegpu.store_matrix %arg1, %arg0[0, 0] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : + vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [32, 1], block = [1, 17]>> + return +} diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir index 0a10f6814ae96..9b3829664108d 100644 --- a/mlir/test/Dialect/XeGPU/ops.mlir +++ b/mlir/test/Dialect/XeGPU/ops.mlir @@ -278,6 +278,15 @@ gpu.func @subgroup_load_nd_offset_1(%src: memref<24x32xf32>, %x : index, %y : in gpu.return } +// CHECK: func @subgroup_load_nd_offset_2(%[[arg0:.*]]: memref<24x32xf32>, %arg1: index) { +gpu.func @subgroup_load_nd_offset_2(%src: memref<24x32xf32>, %x : index) { + // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0 : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> + %1 = xegpu.create_nd_tdesc %src : memref<24x32xf32> 
-> !xegpu.tensor_desc<16x8xf32> + // CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]][%arg1, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32> + %2 = xegpu.load_nd %1[%x, 0] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>, transpose = array<i64: 1, 0>}> : !xegpu.tensor_desc<16x8xf32> -> vector<8x16xf32> + gpu.return +} + // CHECK: func @simt_load_nd_8(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @simt_load_nd_8(%src: memref<24x32xf32>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<16x8xf32> diff --git a/mlir/test/Dialect/XeGPU/optimize-transpose.mlir b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir new file mode 100644 index 0000000000000..24a0de6ed48a5 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/optimize-transpose.mlir @@ -0,0 +1,280 @@ +// RUN: mlir-opt --xevm-attach-target='module=xevm_* chip=pvc' \ +// RUN: --xegpu-optimize-block-loads --canonicalize --cse --split-input-file %s | FileCheck %s + +// CHECK-LABEL: gpu.func @no_scf( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<64x64xf16>, %{{.*}}: vector<8x16xf16>) -> vector<8x16xf32> { +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[C32:.*]] = arith.constant 32 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<64x64xf16> -> index +// CHECK: %[[T0:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[BDESC:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C32]]], strides : [%[[C32]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK-NEXT: %[[B:.*]] = xegpu.load_nd %[[BDESC]][%{{.*}}, %[[C16]]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> +// CHECK: %[[BITCAST:.*]] = vector.bitcast %[[B]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} : vector<16x8xi32> to vector<16x16xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @no_scf(%arg0: memref<64x64xf16>, %arg1: vector<8x16xf16>) -> vector<8x16xf32> { + %c0 = arith.constant 0 : index + %c32 = arith.constant 32 : index + %0 = xegpu.create_nd_tdesc %arg0 : memref<64x64xf16> -> !xegpu.tensor_desc<16x16xf16, #b> + %1 = xegpu.load_nd %0[%c0, %c32] { result_layout = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %2 = vector.transpose %1, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %6 = xegpu.dpas %arg1, %2 { layout_result_0 = #a } : vector<8x16xf16>, vector<16x16xf16> -> vector<8x16xf32> + gpu.return %6 : vector<8x16xf32> +} +} + +// ----- +// CHECK-LABEL: gpu.func @no_scf_i8( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<64x64xi8>, %{{.*}}: vector<8x32xi8>) -> vector<8x16xi32> { +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG0]] : memref<64x64xi8> -> index +// CHECK: %[[T0:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[T0]], shape : [64, %[[C16]]], strides : [%[[C16]], 1] : i64 +// CHECK-SAME: -> 
!xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %[[T2:.*]] = xegpu.load_nd %[[T1]][%{{.*}}, %[[C16]]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> +// CHECK: %[[T3:.*]] = vector.bitcast %[[T2]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 4]>} : vector<16x8xi32> to vector<16x32xi8> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 2]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 4]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [4, 1]> +#c = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +gpu.module @xevm_module { +gpu.func @no_scf_i8(%arg0: memref<64x64xi8>, %arg1: vector<8x32xi8>) -> vector<8x16xi32> { + %c0 = arith.constant 0 : index + %c64 = arith.constant 64 : index + %0 = xegpu.create_nd_tdesc %arg0 : memref<64x64xi8> -> !xegpu.tensor_desc<16x32xi8, #b> + %1 = xegpu.load_nd %0[%c0, %c64] { result_layout = #b } : !xegpu.tensor_desc<16x32xi8, #b> -> vector<16x32xi8> + %2 = vector.transpose %1, [1, 0] { layout_result_0 = #bt } : vector<16x32xi8> to vector<32x16xi8> + %6 = xegpu.dpas %arg1, %2 { layout_result_0 = #c } : vector<8x32xi8>, vector<32x16xi8> -> vector<8x16xi32> + gpu.return %6 : vector<8x16xi32> +} +} + + +// ----- +// CHECK-LABEL: gpu.func @gemm_b_transpose( +// CHECK-SAME: %{{.*}} memref<256x256xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[C256:.*]] = arith.constant 256 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T3:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T4:.*]] = xegpu.create_nd_tdesc %[[T3]], shape : [256, %[[C128]]], strides : [%c128, 1] +// CHECK-SAME: : i64 -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) { +// CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index +// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : +// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> +// CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @gemm_b_transpose(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c256 = arith.constant 256 : index + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %2 = xegpu.create_nd_tdesc %arg0 : 
memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #a> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #b> + %4 = scf.for %arg3 = %c0 to %c256 step %c16 iter_args(%arg4 = %1) -> (vector<8x16xf32>) { + %5 = xegpu.load_nd %2[%c0, %arg3] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf16, #a> -> vector<8x16xf16> + %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %7 = vector.transpose %6, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %8 = xegpu.dpas %5, %7, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %8 : vector<8x16xf32> + } {layout_result_0 = #a} + xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @nested_scf( +// CHECK-SAME: %{{.*}}: memref<256x256xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[C16:.*]] = arith.constant 16 : index +// CHECK: %[[C256:.*]] = arith.constant 256 : index +// CHECK: scf.for %{{.*}} to %{{.*}} step %{{.*}} { +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T3:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T4:.*]] = xegpu.create_nd_tdesc %[[T3]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %{{.*}} = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>) { +// CHECK: %[[T7:.*]] = arith.shrui %[[K]], %[[C1]] : index +// CHECK-NEXT: %[[T8:.*]] = xegpu.load_nd %[[T4]][%{{.*}}, %[[T7]]] {layout_result_0 = #xegpu.layout< +// CHECK-SAME: lane_layout = [16, 1], lane_data = [1, 1]>} : +// CHECK-SAME: !xegpu.tensor_desc<16x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<16x8xi32> +// CHECK-NEXT: %{{.*}} = vector.bitcast %[[T8]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<16x8xi32> to vector<16x16xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @nested_scf(%arg0: memref<256x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c256 = arith.constant 256 : index + scf.for %arg8 = %c0 to %c256 step %c16 { + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%arg8, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %2 = xegpu.create_nd_tdesc %arg0 : memref<256x256xf16> -> !xegpu.tensor_desc<8x16xf16, #a> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #b> + %4 = scf.for %arg3 = %c0 to %c256 step %c16 iter_args(%arg4 = %1) -> (vector<8x16xf32>) { + %5 = xegpu.load_nd %2[%arg8, %arg3] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf16, #a> -> vector<8x16xf16> + %6 = xegpu.load_nd %3[%arg8, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<16x16xf16, #b> -> vector<16x16xf16> + %7 = vector.transpose %6, [1, 0] 
{ layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %8 = xegpu.dpas %5, %7, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %8 : vector<8x16xf32> + } {layout_result_0 = #a} + xegpu.store_nd %4, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + } + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @large_loads( +// CHECK-SAME: %{{.*}}: vector<8x16xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %{{.*}}: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[CST:.*]] = arith.constant dense<0> : vector<32x16xi32> +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T2:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T3:.*]] = xegpu.create_nd_tdesc %[[T2]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 +// CHECK-SAME: -> !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { +// CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index +// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32> +// CHECK: %[[T7:.*]] = vector.insert_strided_slice %[[T6]], %[[CST]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, offsets = [0, 0], strides = [1, 1]} +// CHECK-SAME: : vector<32x8xi32> into vector<32x16xi32> +// CHECK: %[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index +// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32> +// CHECK: %[[T10:.*]] = vector.insert_strided_slice %[[T9]], %[[T7]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>, offsets = [0, 8], strides = [1, 1]} +// CHECK-SAME: : vector<32x8xi32> into vector<32x16xi32> +// CHECK: %{{.*}} = vector.bitcast %[[T10]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<32x16xi32> to vector<32x32xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @large_loads(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> -> !xegpu.tensor_desc<32x32xf16, #b> + %4:4 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %1, %arg5 = %1, %arg6 = %1, %arg7 = %1) + -> (vector<8x16xf32>, vector<8x16xf32>, 
vector<8x16xf32>, vector<8x16xf32>) { + %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } : !xegpu.tensor_desc<32x32xf16, #b> -> vector<32x32xf16> + %7 = vector.extract_strided_slice %6 {offsets = [0, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %8 = vector.extract_strided_slice %6 {offsets = [0, 16], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %9 = vector.extract_strided_slice %6 {offsets = [16, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %10 = vector.extract_strided_slice %6 {offsets = [16, 16], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x32xf16> to vector<16x16xf16> + %11 = vector.transpose %7, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %12 = vector.transpose %8, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %13 = vector.transpose %9, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %14 = vector.transpose %10, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %15 = xegpu.dpas %arg0, %11, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %16 = xegpu.dpas %arg0, %12, %arg5 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %17 = xegpu.dpas %arg0, %13, %arg6 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %18 = xegpu.dpas %arg0, %14, %arg7 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %15, %16, %17, %18 : vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32> + } {layout_result_0 = #a, layout_result_1 = #a, layout_result_2 = #a, layout_result_3 = #a} + xegpu.store_nd %4#0, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#1, %0[%c0, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#2, %0[%c16, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#3, %0[%c16, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + gpu.return +} +} + +// ----- +// CHECK-LABEL: gpu.func @array_length( +// CHECK-SAME: %{{.*}}: vector<8x16xf16>, %[[ARG1:[a-zA-Z0-9]+]]: memref<256x256xf16>, %arg2: memref<256x256xf32>) { +// CHECK: %[[C128:.*]] = arith.constant 128 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[PTR:.*]] = memref.extract_aligned_pointer_as_index %[[ARG1]] : memref<256x256xf16> -> index +// CHECK: %[[T2:.*]] = arith.index_cast %[[PTR]] : index to i64 +// CHECK: %[[T3:.*]] = xegpu.create_nd_tdesc %[[T2]], shape : [256, %[[C128]]], strides : [%[[C128]], 1] : i64 -> +// CHECK-SAME: !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> +// CHECK: %{{.*}}:4 = scf.for %[[K:.*]] = %{{.*}} iter_args(%{{.*}}) -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { +// CHECK: %[[T5:.*]] = arith.shrui %[[K]], %[[C1]] : index +// CHECK: %[[T6:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T5]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32> +// CHECK: %[[T7:.*]] = 
vector.bitcast %[[T6]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16> +// CHECK: %[[T8:.*]] = arith.addi %[[T5]], %[[C8]] : index +// CHECK: %[[T9:.*]] = xegpu.load_nd %[[T3]][%{{.*}}, %[[T8]]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} +// CHECK-SAME: : !xegpu.tensor_desc<32x8xi32, #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>> -> vector<32x8xi32> +// CHECK: %[[T10:.*]] = vector.bitcast %[[T9]] {layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>} +// CHECK-SAME: : vector<32x8xi32> to vector<32x16xf16> +#a = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]> +#b = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> +#bt = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]> +gpu.module @xevm_module { +gpu.func @array_length(%arg0: vector<8x16xf16>, %arg1: memref<256x256xf16>, %arg2: memref<256x256xf32>) { + %c0 = arith.constant 0 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index + %0 = xegpu.create_nd_tdesc %arg2 : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #a> + %1 = xegpu.load_nd %0[%c0, %c0] { layout_result_0 = #a } : !xegpu.tensor_desc<8x16xf32, #a> -> vector<8x16xf32> + %3 = xegpu.create_nd_tdesc %arg1 : memref<256x256xf16> + -> !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr<array_length = 2 : i64>> + %4:4 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %1, %arg5 = %1, %arg6 = %1, %arg7 = %1) + -> (vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { + %6 = xegpu.load_nd %3[%c0, %arg3] { layout_result_0 = #b } + : !xegpu.tensor_desc<32x16xf16, #b, #xegpu.block_tdesc_attr<array_length = 2 : i64>> -> vector<2x32x16xf16> + %19 = vector.extract %6[0] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16> + %20 = vector.extract %6[1] { layout_result_0 = #b } : vector<32x16xf16> from vector<2x32x16xf16> + %7 = vector.extract_strided_slice %19 {offsets = [0, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %8 = vector.extract_strided_slice %19 {offsets = [16, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %9 = vector.extract_strided_slice %20 {offsets = [0, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %10 = vector.extract_strided_slice %20 {offsets = [16, 0], sizes = [16, 16], strides = [1, 1], layout_result_0 = #b } + : vector<32x16xf16> to vector<16x16xf16> + %11 = vector.transpose %7, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %12 = vector.transpose %8, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %13 = vector.transpose %9, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %14 = vector.transpose %10, [1, 0] { layout_result_0 = #bt } : vector<16x16xf16> to vector<16x16xf16> + %15 = xegpu.dpas %arg0, %11, %arg4 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %16 = xegpu.dpas %arg0, %12, %arg5 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %17 = xegpu.dpas %arg0, %13, %arg6 {layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + %18 = xegpu.dpas %arg0, %14, %arg7 
{layout_result_0 = #a} : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %15, %16, %17, %18 : vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32> + } {layout_result_0 = #a, layout_result_1 = #a, layout_result_2 = #a, layout_result_3 = #a} + xegpu.store_nd %4#0, %0[%c0, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#1, %0[%c0, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#2, %0[%c16, %c0] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + xegpu.store_nd %4#3, %0[%c16, %c16] : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #a> + gpu.return +} +} diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index 543e119d81d88..61e315d0d2080 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -215,6 +215,46 @@ func.func @scatter_ops(%src: memref<256xf16>) { } // ----- gpu.module @test { +// CHECK-LABEL: func.func @scatter_ops_custom_perm_layout( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { +// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> +// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> +// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16> +// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] +// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +func.func @scatter_ops_custom_perm_layout(%src: memref<256xf16>) { + %1 = arith.constant dense<1>: vector<16xi1> + %offset = arith.constant dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> + %4 = arith.addf %3, %3 : vector<16xf16> + xegpu.store %4, %src[%offset], %1 <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + return +} +} +// ----- +gpu.module @test { +// CHECK-LABEL: func.func @scatter_ops_preserve_load_perm_layout( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: memref<256xf16>) { +// CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<true> : vector<16xi1> +// CHECK: %[[OFFSETS:.*]] = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex> +// CHECK: %[[LOAD_VEC:.*]] = xegpu.load %[[ARG0]][%[[OFFSETS]]], %[[MASK]] <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> +// CHECK-SAME: {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> +// CHECK: %[[ADD_RES:.*]] = arith.addf %[[LOAD_VEC]], %[[LOAD_VEC]] {layout_result_0 = #xegpu.layout<lane_layout = [8], lane_data = [1]>} : vector<16xf16> +// CHECK: xegpu.store %[[ADD_RES]], %[[ARG0]][%[[OFFSETS]]], %[[MASK]] +// CHECK-SAME <{layout = #xegpu.layout<lane_layout = [8], 
lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> +func.func @scatter_ops_preserve_load_perm_layout(%src: memref<256xf16>) { + %1 = arith.constant dense<1>: vector<16xi1> + %offset = arith.constant dense<12> : vector<16xindex> + %3 = xegpu.load %src[%offset], %1 <{layout = #xegpu.layout<lane_layout = [16], lane_data = [1]>}> : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16xf16> + %4 = arith.addf %3, %3 : vector<16xf16> + xegpu.store %4, %src[%offset], %1 <{layout = #xegpu.layout<lane_layout = [8], lane_data = [1]>}> : vector<16xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1> + return +} +} +// ----- +gpu.module @test { // CHECK-LABEL: func.func @vector_bitcast_i16_to_f16( // CHECK: %[[LOAD0:.*]] = xegpu.load_nd %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} // CHECK-SAME: !xegpu.tensor_desc<8x16xi16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<8x16xi16> diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 27a3dc373c739..8fd3cca5594cb 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -265,3 +265,68 @@ gpu.module @xevm_module{ gpu.return } } + +// ----- +// CHECK-LABEL: gpu.func @load_store_matrix_1({{.*}}) { +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[LANE_ID:.*]] = gpu.lane_id +// CHECK: %[[REMU1:.*]] = index.remu %[[LANE_ID]], %[[C8]] +// CHECK: %[[DIVU:.*]] = index.divu %[[LANE_ID]], %[[C8]] +// CHECK: %[[REMU2:.*]] = index.remu %[[DIVU]], %[[C2]] +// CHECK: %[[REMU3:.*]] = index.remu %[[REMU2]], %[[C2]] +// CHECK: %[[REMU4:.*]] = index.remu %[[REMU1]], %[[C8]] +// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[REMU3]], %[[REMU4]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<1x1xf32> +// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[REMU3]], %[[REMU4]]] : vector<1x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index +gpu.module @xevm_module{ + gpu.func @load_store_matrix_1(%arg0: !xegpu.mem_desc<32x32xf32>) { + %c0 = arith.constant 0 : index + %1 = xegpu.load_matrix %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x8xf32> + xegpu.store_matrix %1, %arg0[%c0, %c0] <{layout = #xegpu.layout<lane_layout = [2, 8], lane_data = [1, 1]>}> : vector<2x8xf32>, !xegpu.mem_desc<32x32xf32>, index, index + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @load_store_matrix_2({{.*}}) { +// CHECK: %[[C8:.*]] = arith.constant 8 : index +// CHECK: %[[C2:.*]] = arith.constant 2 : index +// CHECK: %[[C4:.*]] = arith.constant 4 : index +// CHECK: %[[C1:.*]] = arith.constant 1 : index +// CHECK: %[[LANE_ID:.*]] = gpu.lane_id +// CHECK: %[[REMU1:.*]] = index.remu %[[LANE_ID]], %[[C4]] +// CHECK: %[[DIVU:.*]] = index.divu %[[LANE_ID]], %[[C4]] +// CHECK: %[[REMU2:.*]] = index.remu %[[DIVU]], %[[C4]] +// CHECK: %[[MUL:.*]] = index.mul %[[REMU2]], %[[C2]] +// CHECK: %[[REMU3:.*]] = index.remu %[[MUL]], %[[C8]] +// CHECK: %[[REMU4:.*]] = index.remu %[[REMU1]], %[[C4]] +// CHECK: %[[ADD:.*]] = index.add %[[REMU4]], %[[C1]] +// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%[[REMU3]], %[[ADD]]] : !xegpu.mem_desc<32x32xf32>, index, index -> vector<2x1xf32> +// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%[[REMU3]], %[[ADD]]] : vector<2x1xf32>, !xegpu.mem_desc<32x32xf32>, index, index +gpu.module 
@xevm_module{ + gpu.func @load_store_matrix_2(%arg0: !xegpu.mem_desc<32x32xf32>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = xegpu.load_matrix %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x4xf32> + xegpu.store_matrix %1, %arg0[%c0, %c1] <{layout = #xegpu.layout<lane_layout = [4, 4], lane_data = [2, 1]>}> : vector<8x4xf32>, !xegpu.mem_desc<32x32xf32>, index, index + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @load_store_matrix_3({{.*}}) { +// CHECK: %[[MAT:.*]] = xegpu.load_matrix %arg0[%{{.*}}, %{{.*}}] <{subgroup_block_io}>: +// CHECK-SAME: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<block = [16, 1], stride = [1, 32]>>, index, index -> vector<1x2xf32> +// CHECK: xegpu.store_matrix %[[MAT]], %arg0[%{{.*}}, %{{.*}}] <{subgroup_block_io}>: +// CHECK-SAME: vector<1x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<block = [16, 1], stride = [1, 32]>>, index, index +gpu.module @xevm_module{ + gpu.func @load_store_matrix_3(%arg0: !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %1 = xegpu.load_matrix %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : + !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index -> vector<16x2xf32> + xegpu.store_matrix %1, %arg0[%c0, %c1] {subgroup_block_io, layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>} : + vector<16x2xf32>, !xegpu.mem_desc<32x32xf32, #xegpu.mem_layout<stride = [1, 32], block = [16, 1]>>, index, index + gpu.return + } +} diff --git a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir index b73bc69393dab..02c5f71d5c83d 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-attr-interface.mlir @@ -1,33 +1,32 @@ // RUN: mlir-opt --test-xegpu-layout-interface --cse -split-input-file %s | FileCheck %s -//CHECk: #map = affine_map<()[s0] -> (s0 floordiv 8)> gpu.module @test { gpu.func @slice_attr() -> vector<128xindex> { - //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]] - //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]] - //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] - //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> - //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> - //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C8:.*]] + // CHECK-DAG: %[[REMU:.*]] = index.remu %[[DIVU]], %[[C4:.*]] + // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU]], %[[C32:.*]] + // CHECK-DAG: %[[MOD:.*]] = index.remu %[[MUL]], %[[C128:.*]] + // CHECK-DAG: %[[BASE:.*]] = vector.step : vector<32xindex> + // CHECK-DAG: %[[CAST:.*]] = vector.broadcast %[[MOD]] : index to vector<32xindex> + // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[BASE]], %[[CAST]] : vector<32xindex> %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex> gpu.return %step : vector<128xindex> } gpu.func @nested_slice_attr() -> vector<128xindex> { - //CHECK: 
[[sgId:%.+]] = gpu.subgroup_id : index - //CHECK: [[IDY:%.+]] = affine.apply #map()[[[sgId]]] - //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[LOCALY:%.+]] = index.mul [[IDY]], [[c32]] - //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] - //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> - //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> - //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[DIVU1:.*]] = index.divu %[[SGID]], %[[C1:.*]] + // CHECK-DAG: %[[DIVU2:.*]] = index.divu %[[DIVU1]], %[[C8:.*]] + // CHECK-DAG: %[[REMU:.*]] = index.remu %[[DIVU2]], %[[C4:.*]] + // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU]], %[[C32:.*]] + // CHECK-DAG: %[[MOD:.*]] = index.remu %[[MUL]], %[[C128:.*]] + // CHECK-DAG: %[[BASE:.*]] = vector.step : vector<32xindex> + // CHECK-DAG: %[[CAST:.*]] = vector.broadcast %[[MOD]] : index to vector<32xindex> + // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[BASE]], %[[CAST]] : vector<32xindex> %0 = vector.step {layout_result_0 = #xegpu.slice<#xegpu.slice<#xegpu.layout<sg_layout = [4, 8, 1], sg_data = [32, 32, 1]>, dims = [2]>, dims = [1]>} : vector<128xindex> gpu.return %0 : vector<128xindex> } -} \ No newline at end of file +} + diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir index 09df1e4da43e2..9580769d37313 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-elemwise.mlir @@ -166,14 +166,12 @@ gpu.module @test_elementwise_ops { %load_b = xegpu.load_nd %tdesc_b : !xegpu.tensor_desc<24x32xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>> -> vector<24x32xf32> - // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-12: : vector<2x2xf32> + // CHECK-COUNT-12: arith.negf {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32> // CHECK-NOT: arith.negf %negf = arith.negf %load_a {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>} : vector<24x32xf32> - // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-12: : vector<2x2xf32> + // CHECK-COUNT-12: math.powf {{.*}}, {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [2, 2], lane_data = [1, 1]>} : vector<2x2xf32> // CHECK-NOT: math.powf %powf = math.powf %load_a, %load_b {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2], lane_layout = [2, 2], lane_data = [1, 1]>} diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir index d2d250cbe0f66..01134d8eaabec 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir @@ -1,14 +1,10 @@ // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s -#map = affine_map<()[s0] -> (s0 floordiv 4)> -#map1 = affine_map<()[s0] -> (s0 mod 4)> - gpu.module @test_round_robin_assignment { // CHECK-LABEL: create_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : 
memref<256x128xf32> - // CHECK-SAME: -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.create_nd_tdesc %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -16,22 +12,23 @@ gpu.module @test_round_robin_assignment { } // CHECK-LABEL: create_nd_tdesc_with_shared_data - // CHECK-SAME: [[ARG_0:%.*]]: memref<256x128xf32> + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @create_nd_tdesc_with_shared_data(%src: memref<256x128xf32>) { - //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK: [[IdY:%.+]] = affine.apply #map()[[[sgId]]] - //CHECK: [[IdX:%.+]] = affine.apply #map1()[[[sgId]]] - //CHECK: [[C16:%.+]] = arith.constant 16 : index - //CHECK: [[LY:%.+]] = index.mul [[IdY]], [[C16]] - //CHECK: [[C64:%.+]] = arith.constant 64 : index - //CHECK: [[LX:%.+]] = index.mul [[IdX]], [[C64]] - //CHECK: [[C0:%.+]] = arith.constant 0 : index - //CHECK: [[C0_1:%.+]] = arith.constant 0 : index - //CHECK: [[C128:%.+]] = arith.constant 128 : index - //CHECK: [[offY:%.+]] = index.remu [[LY]], [[C128]] - //CHECK: [[C64_2:%.+]] = arith.constant 64 : index - //CHECK: [[offX:%.+]] = index.remu [[LX]], [[C64_2]] - //CHECK: xegpu.create_nd_tdesc [[ARG_0]][[[offY]], [[offX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32> + // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK: %[[C4:.*]] = arith.constant 4 : index + // CHECK: %[[IDX:.*]] = index.remu %[[SGID]], %[[C4]] + // CHECK: %[[IDY_DIV:.*]] = index.divu %[[SGID]], %[[C4]] + // CHECK: %[[C8:.*]] = arith.constant 8 : index + // CHECK: %[[IDY:.*]] = index.remu %[[IDY_DIV]], %[[C8]] + // CHECK: %[[C16:.*]] = arith.constant 16 : index + // CHECK: %[[LY:.*]] = index.mul %[[IDY]], %[[C16]] + // CHECK: %[[C64:.*]] = arith.constant 64 : index + // CHECK: %[[LX:.*]] = index.mul %[[IDX]], %[[C64]] + // CHECK: %[[C128:.*]] = arith.constant 128 : index + // CHECK: %[[OFFY:.*]] = index.remu %[[LY]], %[[C128]] + // CHECK: %[[C64_1:.*]] = arith.constant 64 : index + // CHECK: %[[OFFX:.*]] = index.remu %[[LX]], %[[C64_1]] + // CHECK: xegpu.create_nd_tdesc %[[ARG_0]][%[[OFFY]], %[[OFFX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<16x64xf32> %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 64]>> gpu.return @@ -42,9 +39,7 @@ gpu.module @test_round_robin_assignment { gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-COUNT-4: xegpu.load_nd %{{.*}} - // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<2x2xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-SAME-COUNT-4: -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.load_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32> // CHECK-NOT: xegpu.load_nd %load = xegpu.load_nd %tdesc : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -57,9 +52,8 @@ 
gpu.module @test_round_robin_assignment { gpu.func @store_nd(%src: memref<256x128xf32>) { %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}} - // CHECK-SAME-COUNT-4: : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-NOT : xegpu.store_nd + // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, %{{.*}} : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-NOT: xegpu.store_nd %load = xegpu.load_nd %tdesc : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<256x128xf32> @@ -73,8 +67,7 @@ gpu.module @test_round_robin_assignment { gpu.func @update_nd(%src: memref<256x128xf32>){ %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-COUNT-4: xegpu.update_nd_offset %{{.*}}, [0, 16] - // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>> + // CHECK-COUNT-4: xegpu.update_nd_offset %{{.*}}, [0, 16] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.update_nd_offset %update = xegpu.update_nd_offset %tdesc, [0, 16] : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -84,15 +77,9 @@ gpu.module @test_round_robin_assignment { // CHECK-LABEL: dpas // CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>) gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) { - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> - // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-NOT: xegpu.create_nd_tdesc - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> - // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [4, 8], lane_data = [1, 1]>> - // CHECK-NOT: xegpu.create_nd_tdesc - // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} - // CHECK-SAME-COUNT-16: {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-16: : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]][%{{.*}}, %{{.*}}] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]][%{{.*}}, %{{.*}}] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> + // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> // CHECK-NOT: xegpu.dpas %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<256x128xf16> -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -113,8 +100,7 @@ gpu.module @test_round_robin_assignment { // CHECK-LABEL: 
prefetch_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @prefetch_nd_tdesc(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.prefetch_nd %{{.*}} - // CHECK-SAME-COUNT-4: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.prefetch_nd %{{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.prefetch_nd %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -131,9 +117,7 @@ gpu.module @test_round_robin_assignment { %load = xegpu.load_nd %tdesc : !xegpu.tensor_desc<128x1xf32, #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 1], lane_layout = [8, 1], lane_data = [1, 1]>> -> vector<128x1xf32> - // CHECK-COUNT-2: vector.broadcast {{.*}} - // CHECK-SAME-COUNT-2: {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-2: : vector<16x1xf32> to vector<16x32xf32> + // CHECK-COUNT-2: vector.broadcast {{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [8, 1], lane_data = [1, 1]>} : vector<16x1xf32> to vector<16x32xf32> // CHECK-NOT: vector.broadcast %broadcast = vector.broadcast %load {layout_result_0 = #xegpu.layout<sg_layout = [4, 1], sg_data = [16, 32], lane_layout = [8, 1], lane_data = [1, 1]>} @@ -171,10 +155,10 @@ gpu.module @test_round_robin_assignment { %0 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32> %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> - //CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32) + // CHECK: scf.while ({{.*}}) : (vector<16xf32>, vector<16xf32>, i32) -> (vector<16xf32>, vector<16xf32>, i32) %3:2 = scf.while (%arg2 = %1, %arg3 = %c0_i32) : (vector<256xf32>, i32) -> (vector<256xf32>, i32) { %4 = arith.cmpi slt, %arg3, %c10_i32 : i32 - //CHECK: scf.condition{{.*}} : vector<16xf32>, vector<16xf32>, i32 + // CHECK: scf.condition{{.*}} : vector<16xf32>, vector<16xf32>, i32 scf.condition(%4) %arg2, %arg3 : vector<256xf32>, i32 } do { // CHECK: ([[arg2:%.+]]: vector<16xf32>, [[arg3:%.+]]: vector<16xf32>, [[arg4:%.+]]: i32) @@ -195,16 +179,16 @@ gpu.module @test_round_robin_assignment { %2 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> %3 = arith.cmpi eq, %0, %c10 : index // CHECK-LABEL: scf.if - // CHECK-SAME: (vector<16xf32>, vector<16xf32>) + // CHECK-SAME: (vector<16xf32>, vector<16xf32>) %4 = scf.if %3 -> (vector<256xf32>) { %5 = xegpu.load_nd %1 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32> // CHECK-LABEL: scf.yield - // CHECK-SAME: vector<16xf32>, vector<16xf32> + // CHECK-SAME: vector<16xf32>, vector<16xf32> scf.yield %5 : vector<256xf32> } else { %5 = xegpu.load_nd %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> -> vector<256xf32> // CHECK-LABEL: scf.yield - // CHECK-SAME: vector<16xf32>, vector<16xf32> + // CHECK-SAME: vector<16xf32>, vector<16xf32> scf.yield %5 : vector<256xf32> } {layout_result_0 = #xegpu.layout<sg_layout = [8], 
sg_data = [16]>} xegpu.store_nd %4, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> @@ -220,16 +204,16 @@ gpu.module @test_round_robin_assignment { %0 = arith.cmpi eq, %id, %c10 : index // CHECK-LABEL: scf.if - // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>) + // CHECK-SAME: (!xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32>) %1 = scf.if %0 -> (!xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>>) { %2 = xegpu.create_nd_tdesc %arg0[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> // CHECK-LABEL: scf.yield - // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> + // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> scf.yield %2 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> } else { %3 = xegpu.create_nd_tdesc %arg1[0] : memref<1024xf32> -> !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> // CHECK-LABEL: scf.yield - // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> + // CHECK-SAME: !xegpu.tensor_desc<16xf32>, !xegpu.tensor_desc<16xf32> scf.yield %3 : !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> } xegpu.store_nd %d, %1 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [8], sg_data = [16]>> @@ -238,8 +222,8 @@ gpu.module @test_round_robin_assignment { gpu.func @convert_layout_optimal(%arg0: memref<32x64xf32>) { %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> - //CHECK-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf32> - //CHECK-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout<inst_data = [16, 16]>, target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32> + // CHECK-COUNT-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf32> + // CHECK-COUNT-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout<inst_data = [16, 16]>, target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32> %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> -> vector<32x64xf32> %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>, target_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>}> : vector<32x64xf32> diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir index 86a021b66949c..84ce80f477a55 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir @@ -14,13 +14,11 @@ gpu.module @test_distribution { // CHECK-LABEL: load_nd_tdesc_with_offset gpu.func @load_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] - // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-SAME-COUNT-4: -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf32> // 
CHECK-NOT: xegpu.load_nd %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> - %load = xegpu.load_nd %tdesc[0, 0] + %load = xegpu.load_nd %tdesc[0, 0] : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<256x128xf32> gpu.return @@ -28,8 +26,7 @@ gpu.module @test_distribution { // CHECK-LABEL: store_nd_with_offset gpu.func @store_nd_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] - // CHECK-SAME-COUNT-4: : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] : vector<16x16xf32>, !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.store_nd %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -42,10 +39,8 @@ gpu.module @test_distribution { } // CHECK-LABEL: prefetch_nd_tdesc_with_offset - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @prefetch_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - // CHECK-COUNT-4: xegpu.prefetch_nd {{%.*}}[{{%.*}}, {{%.*}}] - // CHECK-SAME-COUNT-4: !xegpu.tensor_desc<256x128xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.prefetch_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> // CHECK-NOT: xegpu.prefetch_nd %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -57,15 +52,11 @@ gpu.module @test_distribution { // CHECK-LABEL: dpas // CHECK-SAME: (%[[ARG_0:.*]]: memref<256x128xf16>, %[[ARG_1:.*]]: memref<128x256xf16>) gpu.func @dpas(%a: memref<256x128xf16>, %b: memref<128x256xf16>) { - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf16> - // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK-NOT: xegpu.create_nd_tdesc - // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf16> - // CHECK-SAME-COUNT-4: -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [4, 8], lane_data = [1, 1]>> - // CHECK-NOT: xegpu.create_nd_tdesc - // CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} - // CHECK-SAME-COUNT-16: {layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} - // CHECK-SAME-COUNT-16: : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_0]] : memref<256x128xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<16x16xf16> + // CHECK-COUNT-4: xegpu.create_nd_tdesc %[[ARG_1]] : memref<128x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> + // CHECK-COUNT-4: xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<16x16xf16, #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<16x16xf16> + // 
CHECK-COUNT-16: xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> // CHECK-NOT: xegpu.dpas %tdesc_a = xegpu.create_nd_tdesc %a : memref<256x128xf16> -> !xegpu.tensor_desc<256x128xf16, #xegpu.layout<sg_layout = [8, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -102,27 +93,42 @@ gpu.module @test_distribution { gpu.func @non_splat_constant() { // CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{.*}}> : vector<2x1xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[MAP4:.*]] = affine.apply #map4()[%[[SGID]]] - // CHECK-DAG: %[[MAP5:.*]] = affine.apply #map5()[%[[SGID]]] - // CHECK-DAG: %[[MUL:.*]] = index.mul %[[MAP4]], %[[C2:.*]] - // CHECK-DAG: %[[REMU1:.*]] = index.remu %[[MUL]], %[[C32:.*]] - // CHECK-DAG: %[[REMU2:.*]] = index.remu %[[MAP5]], %[[C1:.*]] + // CHECK-DAG: %[[REMU1:.*]] = index.remu %[[SGID]], %[[C1:.*]] + // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C1:.*]] + // CHECK-DAG: %[[REMU2:.*]] = index.remu %[[DIVU]], %[[C8:.*]] + // CHECK-DAG: %[[MUL:.*]] = index.mul %[[REMU2]], %[[C2:.*]] + // CHECK-DAG: %[[REMU3:.*]] = index.remu %[[MUL]], %[[C32:.*]] + // CHECK-DAG: %[[REMU4:.*]] = index.remu %[[REMU1]], %[[C1:.*]] // CHECK-DAG: %[[ADD16:.*]] = arith.addi %[[MUL]], %[[C16:.*]] : index - // CHECK-DAG: %[[REMU3:.*]] = index.remu %[[ADD16]], %[[C32:.*]] - // CHECK-DAG: %[[REMU4:.*]] = index.remu %[[MAP5]], %[[C1:.*]] - // CHECK-DAG: %[[STRIDE1:.*]] = arith.muli %[[REMU1]], %[[C16:.*]] : index + // CHECK-DAG: %[[REMU5:.*]] = index.remu %[[ADD16]], %[[C32:.*]] + // CHECK-DAG: %[[REMU6:.*]] = index.remu %[[REMU1]], %[[C1:.*]] + // CHECK-DAG: %[[STRIDE1:.*]] = arith.muli %[[REMU3]], %[[C16:.*]] : index // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[C0:.*]], %[[STRIDE1]] : index - // CHECK-DAG: %[[STRIDE2:.*]] = arith.muli %[[REMU2]], %[[C0:.*]] : index + // CHECK-DAG: %[[STRIDE2:.*]] = arith.muli %[[REMU4]], %[[C0:.*]] : index // CHECK-DAG: %[[ADDSTRIDES1:.*]] = arith.addi %[[ADDSTRIDES]], %[[STRIDE2]] : index // CHECK-DAG: %[[BCAST1:.*]] = vector.broadcast %[[ADDSTRIDES1]] : index to vector<2x1xindex> // CHECK-DAG: %[[RESULT1:.*]] = arith.addi %[[BASECST]], %[[BCAST1]] : vector<2x1xindex> - // CHECK-DAG: %[[STRIDE3:.*]] = arith.muli %[[REMU3]], %[[C16:.*]] : index + // CHECK-DAG: %[[STRIDE3:.*]] = arith.muli %[[REMU5]], %[[C16:.*]] : index // CHECK-DAG: %[[ADDSTRIDES2:.*]] = arith.addi %[[C0:.*]], %[[STRIDE3]] : index - // CHECK-DAG: %[[STRIDE4:.*]] = arith.muli %[[REMU4]], %[[C0:.*]] : index + // CHECK-DAG: %[[STRIDE4:.*]] = arith.muli %[[REMU6]], %[[C0:.*]] : index // CHECK-DAG: %[[ADDSTRIDES3:.*]] = arith.addi %[[ADDSTRIDES2]], %[[STRIDE4]] : index // CHECK-DAG: %[[BCAST2:.*]] = vector.broadcast %[[ADDSTRIDES3]] : index to vector<2x1xindex> // CHECK-DAG: %[[RESULT2:.*]] = arith.addi %[[BASECST]], %[[BCAST2]] : vector<2x1xindex> %cst_2 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [8, 1], sg_data = [2, 1]>} dense<[[0], [16], [32], [48], [64], [80], [96], [112], [128], [144], [160], [176], [192], [208], [224], [240], [256], [272], [288], [304], [320], [336], [352], [368], [384], [400], [416], [432], [448], [464], [480], [496]]> : vector<32x1xindex> gpu.return } + + // CHECK-LABEL: vector_transpose + gpu.func @vector_transpose(%src: memref<256x128xf32>) { + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data 
= [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 16], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>> + -> vector<256x128xf32> + // CHECK-COUNT-2: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<32x16xf32> to vector<16x32xf32> + // CHECK-NOT: vector.transpose + %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [4, 8], sg_data = [16, 32], lane_layout = [1, 16], lane_data = [1, 1], order =[1, 0]>} : vector<256x128xf32> to vector<128x256xf32> + gpu.return + } } + diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir index 52acde4dffc2e..4fbb566cfbe73 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir @@ -1,8 +1,5 @@ // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s -//CHECK: #map = affine_map<()[s0] -> (s0 floordiv 4)> -//CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)> -//CHECK: #map2 = affine_map<()[s0] -> (s0 floordiv 8)> gpu.module @test_distribution { // CHECK-LABEL: create_nd_tdesc_no_offset // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> @@ -26,13 +23,23 @@ gpu.module @test_distribution { } // CHECK-LABEL: load_nd_tdesc_with_offset - // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @load_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] - //CHECK: %[[LOAD:.*]] = xegpu.load_nd {{%.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32> - %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> + //CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %{{.*}} : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + //CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + //CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index + //CHECK-DAG: %[[SGIDX:.*]] = index.remu %[[SGID]], %[[C4]] + //CHECK-DAG: %[[SGIDY_TMP:.*]] = index.divu %[[SGID]], %[[C4]] + //CHECK-DAG: %[[C8:.*]] = arith.constant 8 : index + //CHECK-DAG: %[[SGIDY:.*]] = index.remu %[[SGIDY_TMP]], %[[C8]] + //CHECK-DAG: %[[C32:.*]] = arith.constant 32 : index + //CHECK-DAG: %[[L_OFF_Y:.*]] = index.mul %[[SGIDY]], %[[C32]] + //CHECK-DAG: %[[L_OFF_X:.*]] = index.mul %[[SGIDX]], %[[C32]] + //CHECK-DAG: %[[C256:.*]] = arith.constant 256 : index + //CHECK-DAG: %[[OFF_Y:.*]] = index.remu %[[L_OFF_Y]], %[[C256]] + //CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index + //CHECK-DAG: %[[OFF_X:.*]] = index.remu %[[L_OFF_X]], %[[C128]] + //CHECK-DAG: %[[LOAD:.*]] = xegpu.load_nd %[[TDESC]][{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> -> vector<32x32xf32> + %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> %load = xegpu.load_nd %tdesc[0, 0] : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -43,9 +50,6 @@ gpu.module 
@test_distribution { // CHECK-LABEL: store_nd_with_offsets // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @store_nd_with_offsets(%src: memref<256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] //CHECK: xegpu.store_nd %{{.*}}, {{%.*}}[{{%.*}}, {{%.*}}] : vector<32x32xf32>, !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> %tdesc = xegpu.create_nd_tdesc %src: memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> @@ -60,9 +64,6 @@ gpu.module @test_distribution { // CHECK-LABEL: prefetch_nd_tdesc_with_offset // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @prefetch_nd_tdesc_with_offset(%src: memref<256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] //CHECK: xegpu.prefetch_nd %{{.*}}[{{%.*}}, {{%.*}}] : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> %cst0 = arith.constant 0 : index %tdesc = xegpu.create_nd_tdesc %src : memref<256x128xf32> @@ -285,7 +286,7 @@ gpu.module @test_distribution { // CHECK: %[[VAL:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<2.550000e+01> : vector<8xf16> // CHECK: %[[CST:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<0> : vector<8xindex> // CHECK: %[[MASK:.*]] = arith.constant {layout_result_0 = #xegpu.layout<inst_data = [8]>} dense<true> : vector<8xi1> - // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>}> + // CHECK: xegpu.store %[[VAL]], %[[ARG0]][%[[CST]]], %[[MASK]] <{chunk_size = 1 : i64, l1_hint = #xegpu.cache_hint<cached>, layout = #xegpu.layout<inst_data = [8]>}> // CHECK-SAME: {layout_operand_0 = #xegpu.layout<inst_data = [8]>, layout_operand_2 = #xegpu.layout<inst_data = [8]>, // CHECK-SAME: layout_operand_3 = #xegpu.layout<inst_data = [8]>} // CHECK-SAME: : vector<8xf16>, memref<256xf16>, vector<8xindex>, vector<8xi1> @@ -319,17 +320,15 @@ gpu.module @test_distribution { gpu.func @distribute_load_matrix(%arg0: memref<32768xi8, 3>) { //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[arg0]] : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> //CHECK: [[sgid:%.+]] = gpu.subgroup_id : index - //CHECK: [[c2:%.+]] = arith.constant 2 : index //CHECK: [[c4:%.+]] = arith.constant 4 : index - //CHECK: [[c4_0:%.+]] = arith.constant 4 : index - //CHECK: [[id_y:%.+]] = affine.apply #map()[[[sgid]]] - //CHECK: [[id_x:%.+]] = affine.apply #map1()[[[sgid]]] + //CHECK: [[sgidx:%.+]] = index.remu [[sgid]], [[c4]] + //CHECK: [[sgidy_tmp:%.+]] = index.divu [[sgid]], [[c4]] + //CHECK: [[c2:%.+]] = arith.constant 2 : index + //CHECK: [[sgidy:%.+]] = index.remu [[sgidy_tmp]], [[c2]] //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_y:%.+]] = index.mul [[id_y]], [[c32]] - //CHECK: [[c32_1:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]] - //CHECK: [[c0:%.+]] = arith.constant 0 : index - //CHECK: [[c0_1:%.+]] = arith.constant 0 : index + //CHECK: [[l_off_y:%.+]] = index.mul [[sgidy]], [[c32]] + //CHECK: [[c32_0:%.+]] = arith.constant 32 : index + //CHECK: [[l_off_x:%.+]] = index.mul [[sgidx]], [[c32_0]] //CHECK: [[c64:%.+]] = 
arith.constant 64 : index //CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]] //CHECK: [[c128:%.+]] = arith.constant 128 : index @@ -346,17 +345,15 @@ gpu.module @test_distribution { //CHECK: [[cst:%.+]] = arith.constant dense<1.000000e+00> : vector<32x32xf32> //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[arg0]] : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> //CHECK: [[sgid:%.+]] = gpu.subgroup_id : index - //CHECK: [[c2:%.+]] = arith.constant 2 : index //CHECK: [[c4:%.+]] = arith.constant 4 : index - //CHECK: [[c4_0:%.+]] = arith.constant 4 : index - //CHECK: [[id_y:%.+]] = affine.apply #map()[[[sgid]]] - //CHECK: [[id_x:%.+]] = affine.apply #map1()[[[sgid]]] + //CHECK: [[sgidx:%.+]] = index.remu [[sgid]], [[c4]] + //CHECK: [[sgidy_tmp:%.+]] = index.divu [[sgid]], [[c4]] + //CHECK: [[c2:%.+]] = arith.constant 2 : index + //CHECK: [[sgidy:%.+]] = index.remu [[sgidy_tmp]], [[c2]] //CHECK: [[c32:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_y:%.+]] = index.mul [[id_y]], [[c32]] - //CHECK: [[c32_1:%.+]] = arith.constant 32 : index - //CHECK: [[l_off_x:%.+]] = index.mul [[id_x]], [[c32_1]] - //CHECK: [[c0:%.+]] = arith.constant 0 : index - //CHECK: [[c0_2:%.+]] = arith.constant 0 : index + //CHECK: [[l_off_y:%.+]] = index.mul [[sgidy]], [[c32]] + //CHECK: [[c32_0:%.+]] = arith.constant 32 : index + //CHECK: [[l_off_x:%.+]] = index.mul [[sgidx]], [[c32_0]] //CHECK: [[c64:%.+]] = arith.constant 64 : index //CHECK: [[off_y:%.+]] = index.remu [[l_off_y]], [[c64]] //CHECK: [[c128:%.+]] = arith.constant 128 : index @@ -411,14 +408,17 @@ gpu.module @test_distribution { // CHECK-LABEL: vector_step_op gpu.func @vector_step_op_slice_attr() { //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK-DAG: [[IDY:%.+]] = affine.apply #map2()[[[sgId]]] - //CHECK-DAG: [[c32:%.+]] = arith.constant 32 : index - //CHECK-DAG: [[LY:%.+]] = index.mul [[IDY]], [[c32]] - //CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index - //CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index - //CHECK-DAG: [[MODY:%.+]] = index.remu [[LY]], [[c128]] - //CHECK-DAG: [[BASE:%.+]] = vector.step : vector<32xindex> - //CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> + //CHECK: [[c8:%.+]] = arith.constant 8 : index + //CHECK: [[sgidx:%.+]] = index.remu [[sgId]], [[c8]] + //CHECK: [[sgidy_tmp:%.+]] = index.divu [[sgId]], [[c8]] + //CHECK: [[c4:%.+]] = arith.constant 4 : index + //CHECK: [[sgidy:%.+]] = index.remu [[sgidy_tmp]], [[c4]] + //CHECK: [[c32:%.+]] = arith.constant 32 : index + //CHECK: [[LY:%.+]] = index.mul [[sgidy]], [[c32]] + //CHECK: [[c128:%.+]] = arith.constant 128 : index + //CHECK: [[MODY:%.+]] = index.remu [[LY]], [[c128]] + //CHECK: [[BASE:%.+]] = vector.step : vector<32xindex> + //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<32xindex> //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<32xindex> %step = vector.step {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [4, 8], sg_data = [32, 32]>, dims = [1]>}: vector<128xindex> gpu.return @@ -426,14 +426,14 @@ gpu.module @test_distribution { gpu.func @vector_step_op_layout_attr() { //CHECK: [[sgId:%.+]] = gpu.subgroup_id : index - //CHECK-DAG: [[c16:%.+]] = arith.constant 16 : index - //CHECK-DAG: [[c8:%.+]] = arith.constant 8 : index - //CHECK-DAG: [[LOCALY:%.+]] = index.mul [[sgId]], [[c8]] - //CHECK-DAG: [[c0:%.+]] = arith.constant 0 : index - //CHECK-DAG: [[c128:%.+]] = arith.constant 128 : index - //CHECK-DAG: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] - //CHECK-DAG: 
[[BASE:%.+]] = vector.step : vector<8xindex> - //CHECK-DAG: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex> + //CHECK: [[c16:%.+]] = arith.constant 16 : index + //CHECK: [[sgidx:%.+]] = index.remu [[sgId]], [[c16]] + //CHECK: [[c8:%.+]] = arith.constant 8 : index + //CHECK: [[LOCALY:%.+]] = index.mul [[sgidx]], [[c8]] + //CHECK: [[c128:%.+]] = arith.constant 128 : index + //CHECK: [[MODY:%.+]] = index.remu [[LOCALY]], [[c128]] + //CHECK: [[BASE:%.+]] = vector.step : vector<8xindex> + //CHECK: [[CAST:%.+]] = vector.broadcast [[MODY]] : index to vector<8xindex> //CHECK: [[ADD:%.+]] = arith.addi [[BASE]], [[CAST]] : vector<8xindex> %step = vector.step {layout_result_0 = #xegpu.layout<sg_layout = [16], sg_data = [8]>}: vector<128xindex> gpu.return @@ -464,14 +464,27 @@ gpu.module @test_distribution { gpu.return } + // CHECK-LABEL: vector_transpose + gpu.func @vector_transpose(%src: memref<256x32xf32>) { + %tdesc = xegpu.create_nd_tdesc %src : memref<256x32xf32> + -> !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>> + %load = xegpu.load_nd %tdesc[0, 0] + : !xegpu.tensor_desc<256x32xf32, #xegpu.layout<sg_layout = [4, 8], sg_data = [64, 32], lane_layout = [16, 1], lane_data = [1, 1], order =[0, 1]>> + -> vector<256x32xf32> + //CHECK: vector.transpose {{.*}}, [1, 0] {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<64x32xf32> to vector<32x64xf32> + %trans = vector.transpose %load, [1, 0] {layout_result_0 = #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 64], lane_layout = [1, 16], lane_data = [1, 1], order =[1, 0]>} : vector<256x32xf32> to vector<32x256xf32> + gpu.return + } + // CHECK-LABEL: non_splat_constant_2D gpu.func @non_splat_constant_2D() { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1x1xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: affine.apply #map4()[%[[SGID]]] - // CHECK-DAG: affine.apply #map5()[%[[SGID]]] - // CHECK-DAG: %[[IDY:.*]] = index.remu %{{.*}}, %[[C32:.*]] - // CHECK-DAG: %[[IDX:.*]] = index.remu %{{.*}}, %[[C1:.*]] + // CHECK-DAG: %[[SGIDX:.*]] = index.remu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY_TMP:.*]] = index.divu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY:.*]] = index.remu %[[SGIDY_TMP]], %{{.*}} + // CHECK-DAG: %[[IDY:.*]] = index.remu %[[SGIDY]], %{{.*}} + // CHECK-DAG: %[[IDX:.*]] = index.remu %[[SGIDX]], %{{.*}} // CHECK-DAG: %[[STRIDECOL:.*]] = arith.muli %[[IDY]], %[[C16:.*]] : index // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[C0:.*]], %[[STRIDECOL]] : index // CHECK-DAG: %[[STRIDEROW:.*]] = arith.muli %[[IDX]], %[[C0:.*]] : index @@ -484,20 +497,19 @@ gpu.module @test_distribution { // CHECK-LABEL: non_splat_constant_2D_non_unit_dim gpu.func @non_splat_constant_2D_non_unit_dim() { - // CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{.*}} : vector<2x2xindex> + // CHECK-DAG: %[[BASECST:.*]] = arith.constant dense<{{\[}}{{\[}}0, 16{{\]}}, {{\[}}8, 24{{\]}}{{\]}}> : vector<2x2xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[IDY:.*]] = affine.apply #map()[%[[SGID]]] - // CHECK-DAG: %[[IDX:.*]] = affine.apply #map1()[%[[SGID]]] - // CHECK-DAG: %[[MULY:.*]] = index.mul %[[IDY]], %[[C2:.*]] - // CHECK-DAG: %[[C2_2:.*]] = arith.constant 2 : index - // CHECK-DAG: %[[MULX:.*]] = index.mul %[[IDX]], %[[C2:.*]] + // CHECK-DAG: %[[SGIDX:.*]] = index.remu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY_TMP:.*]] = index.divu 
%[[SGID]], %{{.*}} + // CHECK-DAG: %[[SGIDY:.*]] = index.remu %[[SGIDY_TMP]], %{{.*}} + // CHECK-DAG: %[[MULY:.*]] = index.mul %[[SGIDY]], %[[C2:.*]] + // CHECK-DAG: %[[MULX:.*]] = index.mul %[[SGIDX]], %{{.*}} // CHECK-DAG: %[[REMU_Y:.*]] = index.remu %[[MULY]], %[[C8:.*]] - // CHECK-DAG: %[[C8_2:.*]] = arith.constant 8 : index - // CHECK-DAG: %[[REMU_X:.*]] = index.remu %[[MULX]], %[[C8:.*]] - // CHECK-DAG: %[[MUL5:.*]] = arith.muli %[[REMU_Y]], %[[C8:.*]] : index + // CHECK-DAG: %[[REMU_X:.*]] = index.remu %[[MULX]], %{{.*}} + // CHECK-DAG: %[[MUL5:.*]] = arith.muli %[[REMU_Y]], %{{.*}} : index // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[C0:.*]], %[[MUL5]] : index // CHECK-DAG: %[[MUL6:.*]] = arith.muli %[[REMU_X]], %[[C16:.*]] : index - // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[ADD]], %[[MUL6]] : index + // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[ADD]], %[[MUL6]] : index // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<2x2xindex> // CHECK-DAG: %[[ADDCST:.*]] = arith.addi %[[BASECST]], %[[BCAST]] : vector<2x2xindex> %cst_8x8 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [4, 4], sg_data = [2, 2]>} dense<[ @@ -517,13 +529,14 @@ gpu.module @test_distribution { gpu.func @non_splat_constant() { // CHECK-DAG: %[[CST:.*]] = arith.constant dense<0> : vector<1xindex> // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK-DAG: %[[REMU:.*]] = index.remu %[[SGID]], %[[C32:.*]] - // CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU]], %[[C16:.*]] : index + // CHECK-DAG: %[[REMU:.*]] = index.remu %[[SGID]], %{{.*}} + // CHECK-DAG: %[[REMU2:.*]] = index.remu %[[REMU]], %{{.*}} + // CHECK-DAG: %[[MUL:.*]] = arith.muli %[[REMU2]], %[[C16:.*]] : index // CHECK-DAG: %[[ADDSTRIDES:.*]] = arith.addi %[[C0:.*]], %[[MUL]] : index // CHECK-DAG: %[[BCAST:.*]] = vector.broadcast %[[ADDSTRIDES]] : index to vector<1xindex> // CHECK-DAG: %[[ADD:.*]] = arith.addi %[[CST]], %[[BCAST]] : vector<1xindex> %cst = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32], sg_data = [1]>} dense<[0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496]> : vector<32xindex> - // CHECK: arith.constant dense<{{\[}}[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]{{\]}}> : vector<1x16xindex> + // CHECK: arith.constant dense<{{\[}}{{\[}}0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15{{\]}}{{\]}}> : vector<1x16xindex> %cst_1 = arith.constant {layout_result_0 = #xegpu.layout<sg_layout = [32, 1], sg_data = [1, 16]>} dense<[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]]> : vector<1x16xindex> gpu.return } diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir index e83229e3a3995..5ce3d1d0fb5d6 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir @@ -1,47 +1,35 @@ // RUN: mlir-opt --xegpu-wg-to-sg-distribute -split-input-file %s | FileCheck %s -//CHECK: #map = affine_map<()[s0] -> (s0 floordiv 4)> -//CHECK: #map1 = affine_map<()[s0] -> (s0 mod 4)> gpu.module @test_1_1_assignment { // CHECK-LABEL: create_nd_tdesc - // CHECK-SAME: [[ARG_0:%.*]]: memref<256x128xf32> + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @create_nd_tdesc(%src: memref<256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] - //CHECK: [[C32:%.+]] = 
arith.constant 32 : index - //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C32]] - //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]] - //CHECK: [[C0:%.+]] = arith.constant 0 : index - //CHECK: [[C0_1:%.+]] = arith.constant 0 : index - //CHECK: [[C256:%.+]] = arith.constant 256 : index - //CHECK: [[Y:%.+]] = index.remu [[LY]], [[C256]] - //CHECK: [[C128:%.+]] = arith.constant 128 : index - //CHECK: [[X:%.+]] = index.remu [[LX]], [[C128]] - //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][[[Y]], [[X]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[REMUX:.*]] = index.remu %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[REMUY:.*]] = index.remu %[[DIVU]], %[[C8:.*]] + // CHECK-DAG: %[[MULY:.*]] = index.mul %[[REMUY]], %[[C32:.*]] + // CHECK-DAG: %[[MULX:.*]] = index.mul %[[REMUX]], %[[C32:.*]] + // CHECK-DAG: %[[MODY:.*]] = index.remu %[[MULY]], %[[C256:.*]] + // CHECK-DAG: %[[MODX:.*]] = index.remu %[[MULX]], %[[C128:.*]] + // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][%[[MODY]], %[[MODX]]] : memref<256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> gpu.return } // CHECK-LABEL: create_nd_tdesc_from_higher_rank_memref - // CHECK-SAME: [[ARG_0:%.*]]: memref<3x256x128xf32> + // CHECK-SAME: %[[ARG_0:.*]]: memref<3x256x128xf32> gpu.func @create_nd_tdesc_from_higher_rank_memref(%src: memref<3x256x128xf32>) { - //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index - //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] - //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] - //CHECK: [[C32:%.+]] = arith.constant 32 : index - //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C32]] - //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]] - //CHECK: [[C0:%.+]] = arith.constant 0 : index - //CHECK: [[C0_2:%.+]] = arith.constant 0 : index - //CHECK: [[C256:%.+]] = arith.constant 256 : index - //CHECK: [[MODY:%.+]] = index.remu [[LY]], [[C256]] - //CHECK: [[C128:%.+]] = arith.constant 128 : index - //CHECK: [[MODX:%.+]] = index.remu [[LX]], [[C128]] - //CHECK: [[C0_3:%.+]] = arith.constant 0 : index - //CHECK: [[C0_4:%.+]] = arith.constant 0 : index - //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][1, [[MODY]], [[MODX]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK-DAG: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK-DAG: %[[REMUX:.*]] = index.remu %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[DIVU:.*]] = index.divu %[[SGID]], %[[C4:.*]] + // CHECK-DAG: %[[REMUY:.*]] = index.remu %[[DIVU]], %[[C8:.*]] + // CHECK-DAG: %[[MULY:.*]] = index.mul %[[REMUY]], %[[C32:.*]] + // CHECK-DAG: %[[MULX:.*]] = index.mul %[[REMUX]], %[[C32:.*]] + // CHECK-DAG: %[[MODY:.*]] = index.remu %[[MULY]], %[[C256:.*]] + // CHECK-DAG: %[[MODX:.*]] = index.remu %[[MULX]], %[[C128:.*]] + // CHECK-DAG: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][1, %[[MODY]], %[[MODX]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> %tdesc = xegpu.create_nd_tdesc %src[1, 0, 0] : memref<3x256x128xf32> -> !xegpu.tensor_desc<256x128xf32, 
#xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> gpu.return @@ -81,25 +69,24 @@ gpu.module @test_1_1_assignment { xegpu.store_nd %load, %tdesc : vector<256x128xf32>, !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> gpu.return -} + } -// CHECK-LABEL: update_nd -// CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> -gpu.func @update_nd(%src: memref<256x128xf32>){ - // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32> - // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - // CHECK: %[[UPDATE:.*]] = xegpu.update_nd_offset %[[TDESC]], [0, 16] - // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> - %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> - -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> - %update = xegpu.update_nd_offset %tdesc, [0, 16] - : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> - gpu.return -} + // CHECK-LABEL: update_nd + // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> + gpu.func @update_nd(%src: memref<256x128xf32>){ + // CHECK: %[[TDESC:.*]] = xegpu.create_nd_tdesc %[[ARG_0]][{{%.*}}, {{%.*}}] : memref<256x128xf32> + // CHECK-SAME: -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + // CHECK: %[[UPDATE:.*]] = xegpu.update_nd_offset %[[TDESC]], [0, 16] + // CHECK-SAME: : !xegpu.tensor_desc<32x32xf32, #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>> + %tdesc = xegpu.create_nd_tdesc %src[0, 0] : memref<256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> + %update = xegpu.update_nd_offset %tdesc, [0, 16] + : !xegpu.tensor_desc<256x128xf32, #xegpu.layout<sg_layout = [8, 4], sg_data = [32, 32], lane_layout = [1, 16], lane_data = [1, 1]>> + gpu.return + } -// CHECK-LABEL: dpas -gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> + // CHECK-LABEL: dpas + gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128], lane_layout = [1, 16], lane_data = [1, 1]>> %load_a = xegpu.load_nd %tdesc_a @@ -110,16 +97,15 @@ gpu.func @dpas(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %load_b = xegpu.load_nd %tdesc_b : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16], lane_layout = [1, 16], lane_data = [2, 1]>> -> vector<128x128xf16> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<16x128xf16>, vector<128x16xf16> -> vector<16x16xf32> %dpas = xegpu.dpas %load_a, %load_b {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> gpu.return } - -// CHECK-LABEL: dpas_no_sg_data 
-gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { - // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> + // CHECK-LABEL: dpas_no_sg_data + gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<128x128xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>> @@ -134,6 +120,7 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { : !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [2, 1], order = [1, 0]>> -> vector<128x128xf16> + // CHECK: %[[DPAS:.*]] = xegpu.dpas %{{.*}}, %{{.*}} {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<16x16xf16>, vector<16x16xf16> -> vector<16x16xf32> %dpas = xegpu.dpas %load_a, %load_b {layout_result_0 = #xegpu.layout<sg_layout = [8, 8], lane_layout = [1, 16], lane_data = [1, 1], order = [1, 0]>} : vector<128x128xf16>, vector<128x128xf16> -> vector<128x128xf32> @@ -196,9 +183,9 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { } gpu.func @scf_for(%arg0: memref<1024x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<1024x1024xf32>) { - //CHECK: [[c0:%.+]] = arith.constant 0 : index - //CHECK: [[c128:%.+]] = arith.constant 128 : index - //CHECK: [[c1024:%.+]] = arith.constant 1024 : index + // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index + // CHECK-DAG: %[[C128:.*]] = arith.constant 128 : index + // CHECK-DAG: %[[C1024:.*]] = arith.constant 1024 : index %c0 = arith.constant 0 : index %c128 = arith.constant 128 : index %c1024 = arith.constant 1024 : index @@ -211,15 +198,15 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %4 = xegpu.create_nd_tdesc %arg0[%0, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>> %5 = xegpu.create_nd_tdesc %arg1[%c0, %1] : memref<1024x1024xf16> -> !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>> - // CHECK: [[scf:%.+]]:3 = scf.for [[arg3:%.+]] = [[c0]] to [[c1024]] step [[c128]] - // CHECK-SAME: iter_args([[arg4:%.+]] = {{.*}}, [[arg5:%.+]] = {{.*}}, [[arg6:%.+]] = {{.*}}) -> + // CHECK: %[[SCF:.*]]:3 = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C1024]] step %[[C128]] + // CHECK-SAME: iter_args(%[[ARG4:.*]] = {{.*}}, %[[ARG5:.*]] = {{.*}}, %[[ARG6:.*]] = {{.*}}) -> // CHECK-SAME: (!xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32>) - // CHECK: [[a:%.+]] = xegpu.load_nd [[arg4]] : !xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16> - // CHECK: [[b:%.+]] = xegpu.load_nd [[arg5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16> - // CHECK: [[c:%.+]] = xegpu.dpas [[a]], [[b]], [[arg6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32> - // CHECK: [[at:%.+]] = xegpu.update_nd_offset [[arg4]], [[[c0]], [[c128]]] : !xegpu.tensor_desc<16x128xf16> - // CHECK: [[bt:%.+]] = xegpu.update_nd_offset [[arg5]], [[[c128]], [[c0]]] : !xegpu.tensor_desc<128x16xf16> - // CHECK: scf.yield [[at]], [[bt]], [[c]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32> + // CHECK: %[[A:.*]] = xegpu.load_nd %[[ARG4]] : 
!xegpu.tensor_desc<16x128xf16> -> vector<16x128xf16> + // CHECK: %[[B:.*]] = xegpu.load_nd %[[ARG5]] : !xegpu.tensor_desc<128x16xf16> -> vector<128x16xf16> + // CHECK: %[[C:.*]] = xegpu.dpas %[[A]], %[[B]], %[[ARG6]] : vector<16x128xf16>, vector<128x16xf16>, vector<16x16xf32> -> vector<16x16xf32> + // CHECK: %[[AT:.*]] = xegpu.update_nd_offset %[[ARG4]], [%[[C0]], %[[C128]]] : !xegpu.tensor_desc<16x128xf16> + // CHECK: %[[BT:.*]] = xegpu.update_nd_offset %[[ARG5]], [%[[C128]], %[[C0]]] : !xegpu.tensor_desc<128x16xf16> + // CHECK: scf.yield %[[AT]], %[[BT]], %[[C]] : !xegpu.tensor_desc<16x128xf16>, !xegpu.tensor_desc<128x16xf16>, vector<16x16xf32> %6:3 = scf.for %arg3 = %c0 to %c1024 step %c128 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3) -> (!xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [16, 128]>>, !xegpu.tensor_desc<128x128xf16, #xegpu.layout<sg_layout = [8, 8], sg_data = [128, 16]>>, vector<128x128xf32>) { @@ -252,7 +239,7 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { // CHECK: scf.condition{{.*}} : vector<16xf32>, i32 scf.condition(%4) %arg2, %arg3 : vector<256xf32>, i32 } do { - // CHECK: ([[arg2:%.+]]: vector<16xf32>, [[arg3:%.+]]: i32) + // CHECK: (%[[ARG2:.*]]: vector<16xf32>, %[[ARG3:.*]]: i32) ^bb0(%arg2: vector<256xf32>, %arg3: i32): xegpu.store_nd %arg2, %2 : vector<256xf32>, !xegpu.tensor_desc<256xf32, #xegpu.layout<sg_layout = [16], sg_data = [16]>> %4 = arith.addi %arg3, %c1_i32 : i32 @@ -344,9 +331,9 @@ gpu.func @dpas_no_sg_data(%a: memref<128x128xf16>, %b: memref<128x128xf16>) { %cond4 = arith.cmpi slt, %sg_id, %c31 : index %cond5 = arith.andi %cond3, %cond4 : i1 scf.if %cond5 { - // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index - // CHECK: %[[C2:.*]] = arith.constant 2 : index - // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]] + // CHECK: %[[SGID:.*]] = gpu.subgroup_id : index + // CHECK: %[[C2:.*]] = arith.constant 2 : index + // CHECK: %[[SUB:.*]] = index.sub %{{.*}}, %[[C2]] %tdesc = xegpu.create_nd_tdesc %src2[0, 0] : memref<128x64xf32> -> !xegpu.tensor_desc<128x64xf32, #xegpu.layout<sg_layout = [4, 4], sg_data = [32, 16], lane_layout = [8, 4], lane_data = [1, 1]>> %load = xegpu.load_nd %tdesc diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir index 60bd24a27868e..1e4cf8d4589cb 100644 --- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir @@ -1308,7 +1308,7 @@ llvm.func @experimental_constrained_fpext(%s: f32, %v: vector<4xf32>) { // CHECK-DAG: declare float @llvm.cos.f32(float) // CHECK-DAG: declare <8 x float> @llvm.cos.v8f32(<8 x float>) #0 // CHECK-DAG: declare { float, float } @llvm.sincos.f32(float) -// CHECK-DAG: declare { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float>) #0 +// CHECK-DAG: declare { <8 x float>, <8 x float> } @llvm.sincos.v8f32(<8 x float>) // CHECK-DAG: declare float @llvm.copysign.f32(float, float) // CHECK-DAG: declare float @llvm.rint.f32(float) // CHECK-DAG: declare double @llvm.rint.f64(double) diff --git a/mlir/test/Target/LLVMIR/nvvm/convert_stochastic_rounding.mlir b/mlir/test/Target/LLVMIR/nvvm/convert_stochastic_rounding.mlir new file mode 100644 index 0000000000000..b5bb22350dcd7 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/convert_stochastic_rounding.mlir @@ -0,0 +1,182 @@ +// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s + +// ----- + +// Test valid architectures work + +// Valid case on sm_100a +gpu.module @valid_f16x2_rs_sm_100a 
[#nvvm.target<chip = "sm_100a">] { + func.func @convert_rs() { + %f1 = llvm.mlir.constant(1.0 : f32) : f32 + %f2 = llvm.mlir.constant(2.0 : f32) : f32 + %rbits = llvm.mlir.constant(0x12345678 : i32) : i32 + %res = nvvm.convert.f32x2.to.f16x2 %f1, %f2, %rbits : vector<2xf16> + return + } +} + +// Valid case on sm_103a +gpu.module @valid_bf16x2_rs_sm_103a [#nvvm.target<chip = "sm_103a">] { + func.func @convert_rs() { + %f1 = llvm.mlir.constant(1.0 : f32) : f32 + %f2 = llvm.mlir.constant(2.0 : f32) : f32 + %rbits = llvm.mlir.constant(0 : i32) : i32 + %res = nvvm.convert.f32x2.to.bf16x2 %f1, %f2, %rbits : vector<2xbf16> + return + } +} + +// ----- + +// Test F32x2 -> F16x2 with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x2_to_f16x2_rs +llvm.func @convert_f32x2_to_f16x2_rs(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // CHECK: %{{.*}} = call <2 x half> @llvm.nvvm.ff2f16x2.rs(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// CHECK-LABEL: @convert_f32x2_to_f16x2_rs_satfinite +llvm.func @convert_f32x2_to_f16x2_rs_satfinite(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // CHECK: %{{.*}} = call <2 x half> @llvm.nvvm.ff2f16x2.rs.satfinite(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits {sat = #nvvm.sat_mode<satfinite>} : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// CHECK-LABEL: @convert_f32x2_to_f16x2_rs_relu +llvm.func @convert_f32x2_to_f16x2_rs_relu(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // CHECK: %{{.*}} = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits {relu = true} : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// CHECK-LABEL: @convert_f32x2_to_f16x2_rs_relu_satfinite +llvm.func @convert_f32x2_to_f16x2_rs_relu_satfinite(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xf16> { + // CHECK: %{{.*}} = call <2 x half> @llvm.nvvm.ff2f16x2.rs.relu.satfinite(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.f16x2 %srcA, %srcB, %rbits {relu = true, sat = #nvvm.sat_mode<satfinite>} : vector<2xf16> + llvm.return %res : vector<2xf16> +} + +// ----- + +// Test F32x2 -> BF16x2 with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x2_to_bf16x2_rs +llvm.func @convert_f32x2_to_bf16x2_rs(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // CHECK: %{{.*}} = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// CHECK-LABEL: @convert_f32x2_to_bf16x2_rs_satfinite +llvm.func @convert_f32x2_to_bf16x2_rs_satfinite(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // CHECK: %{{.*}} = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.satfinite(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits {sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// CHECK-LABEL: @convert_f32x2_to_bf16x2_rs_relu +llvm.func @convert_f32x2_to_bf16x2_rs_relu(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // CHECK: %{{.*}} = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits {relu = true} : 
vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// CHECK-LABEL: @convert_f32x2_to_bf16x2_rs_relu_satfinite +llvm.func @convert_f32x2_to_bf16x2_rs_relu_satfinite(%srcA : f32, %srcB : f32, %rbits : i32) -> vector<2xbf16> { + // CHECK: %{{.*}} = call <2 x bfloat> @llvm.nvvm.ff2bf16x2.rs.relu.satfinite(float %{{.*}}, float %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x2.to.bf16x2 %srcA, %srcB, %rbits {relu = true, sat = #nvvm.sat_mode<satfinite>} : vector<2xbf16> + llvm.return %res : vector<2xbf16> +} + +// ----- + +// Test F32x4 -> F8x4 (E4M3) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f8x4_e4m3_rs +llvm.func @convert_f32x4_to_f8x4_e4m3_rs(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E4M3FN) + llvm.return %res : vector<4xi8> +} + +// CHECK-LABEL: @convert_f32x4_to_f8x4_e4m3_rs_relu +llvm.func @convert_f32x4_to_f8x4_e4m3_rs_relu(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e4m3x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits {relu = true} : vector<4xf32> -> vector<4xi8> (f8E4M3FN) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test F32x4 -> F8x4 (E5M2) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f8x4_e5m2_rs +llvm.func @convert_f32x4_to_f8x4_e5m2_rs(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f8E5M2) + llvm.return %res : vector<4xi8> +} + +// CHECK-LABEL: @convert_f32x4_to_f8x4_e5m2_rs_relu +llvm.func @convert_f32x4_to_f8x4_e5m2_rs_relu(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e5m2x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f8x4 %src, %rbits {relu = true} : vector<4xf32> -> vector<4xi8> (f8E5M2) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test F32x4 -> F6x4 (E2M3) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f6x4_e2m3_rs +llvm.func @convert_f32x4_to_f6x4_e2m3_rs(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f6E2M3FN) + llvm.return %res : vector<4xi8> +} + +// CHECK-LABEL: @convert_f32x4_to_f6x4_e2m3_rs_relu +llvm.func @convert_f32x4_to_f6x4_e2m3_rs_relu(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e2m3x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits {relu = true} : vector<4xf32> -> vector<4xi8> (f6E2M3FN) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test F32x4 -> F6x4 (E3M2) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f6x4_e3m2_rs +llvm.func @convert_f32x4_to_f6x4_e3m2_rs(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits : vector<4xf32> -> vector<4xi8> (f6E3M2FN) 
+ llvm.return %res : vector<4xi8> +} + +// CHECK-LABEL: @convert_f32x4_to_f6x4_e3m2_rs_relu +llvm.func @convert_f32x4_to_f6x4_e3m2_rs_relu(%src : vector<4xf32>, %rbits : i32) -> vector<4xi8> { + // CHECK: %{{.*}} = call <4 x i8> @llvm.nvvm.f32x4.to.e3m2x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f6x4 %src, %rbits {relu = true} : vector<4xf32> -> vector<4xi8> (f6E3M2FN) + llvm.return %res : vector<4xi8> +} + +// ----- + +// Test F32x4 -> F4x4 (E2M1) with stochastic rounding (.rs) + +// CHECK-LABEL: @convert_f32x4_to_f4x4_e2m1_rs +llvm.func @convert_f32x4_to_f4x4_e2m1_rs(%src : vector<4xf32>, %rbits : i32) -> i16 { + // CHECK: %{{.*}} = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f4x4 %src, %rbits : vector<4xf32> -> i16 (f4E2M1FN) + llvm.return %res : i16 +} + +// CHECK-LABEL: @convert_f32x4_to_f4x4_e2m1_rs_relu +llvm.func @convert_f32x4_to_f4x4_e2m1_rs_relu(%src : vector<4xf32>, %rbits : i32) -> i16 { + // CHECK: %{{.*}} = call i16 @llvm.nvvm.f32x4.to.e2m1x4.rs.relu.satfinite(<4 x float> %{{.*}}, i32 %{{.*}}) + %res = nvvm.convert.f32x4.to.f4x4 %src, %rbits {relu = true} : vector<4xf32> -> i16 (f4E2M1FN) + llvm.return %res : i16 +} + diff --git a/mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir new file mode 100644 index 0000000000000..a8a743006fbf8 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/redux-sync-invalid.mlir @@ -0,0 +1,49 @@ +// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s + +// ----- + +llvm.func @redux_sync_i32_with_abs(%value: i32, %offset: i32) { + // expected-error@+1 {{abs attribute is supported only for f32 type}} + %res = nvvm.redux.sync add %value, %offset {abs = true}: i32 -> i32 + llvm.return +} + +// ----- + +llvm.func @redux_sync_i32_with_nan(%value: i32, %offset: i32) { + // expected-error@+1 {{nan attribute is supported only for f32 type}} + %res = nvvm.redux.sync add %value, %offset {nan = true}: i32 -> i32 + llvm.return +} + +// ----- + +llvm.func @redux_sync_f32_with_invalid_kind_add(%value: f32, %offset: i32) { + // expected-error@+1 {{'add' redux kind unsupported with 'f32' type. Only supported type is 'i32'.}} + %res = nvvm.redux.sync add %value, %offset: f32 -> f32 + llvm.return +} + +// ----- + +llvm.func @redux_sync_f32_with_invalid_kind_and(%value: f32, %offset: i32) { + // expected-error@+1 {{'and' redux kind unsupported with 'f32' type. Only supported type is 'i32'.}} + %res = nvvm.redux.sync and %value, %offset: f32 -> f32 + llvm.return +} + +// ----- + +llvm.func @redux_sync_i32_with_invalid_kind_fmin(%value: i32, %offset: i32) { + // expected-error@+1 {{'fmin' redux kind unsupported with 'i32' type. 
Only supported type is 'f32'.}} + %res = nvvm.redux.sync fmin %value, %offset: i32 -> i32 + llvm.return +} + +// ----- + +llvm.func @redux_sync_non_matching_types(%value: i32, %offset: i32) { + // expected-error@+1 {{failed to verify that all of {res, val} have same type}} + %res = nvvm.redux.sync add %value, %offset: i32 -> f32 + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvm/shfl-sync-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/shfl-sync-invalid.mlir new file mode 100644 index 0000000000000..f2ccfe71a3f23 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/shfl-sync-invalid.mlir @@ -0,0 +1,22 @@ +// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s + +// ----- + +func.func @nvvm_invalid_shfl_pred(%arg0 : i32, %arg1 : f32, %arg2 : i32, %arg3 : i32) { + // expected-error@+1 {{"return_value_and_is_valid" attribute must be specified when the return type is a struct type}} + %0 = nvvm.shfl.sync bfly %arg0, %arg1, %arg2, %arg3 : f32 -> !llvm.struct<(f32, i1)> +} + +// ----- + +func.func @nvvm_invalid_shfl_invalid_return_type_1(%arg0 : i32, %arg1 : f32, %arg2 : i32, %arg3 : i32) { + // expected-error@+1 {{expected return type to be of type 'f32' but got 'i32' instead}} + %0 = nvvm.shfl.sync bfly %arg0, %arg1, %arg2, %arg3 : f32 -> i32 +} + +// ----- + +func.func @nvvm_invalid_shfl_invalid_return_type_2(%arg0 : i32, %arg1 : f32, %arg2 : i32, %arg3 : i32) { + // expected-error@+1 {{expected first element in the returned struct to be of type 'f32' but got 'i32' instead}} + %0 = nvvm.shfl.sync bfly %arg0, %arg1, %arg2, %arg3 {return_value_and_is_valid} : f32 -> !llvm.struct<(i32, i1)> +} diff --git a/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-invalid.mlir b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-invalid.mlir new file mode 100644 index 0000000000000..1b93f20c15b99 --- /dev/null +++ b/mlir/test/Target/LLVMIR/nvvm/tcgen05-ld-invalid.mlir @@ -0,0 +1,9 @@ +// RUN: mlir-translate -verify-diagnostics -split-input-file -mlir-to-llvmir %s + +// ----- + +llvm.func @nvvm_tcgen05_ld_32x32b_offset(%tmemAddr : !llvm.ptr<6>, %offset : i64) -> () { + // expected-error@+1 {{offset argument is only supported for shape 16x32bx2}} + %ldv2 = nvvm.tcgen05.ld %tmemAddr, %offset { pack, shape = #nvvm.tcgen05_ldst_shape<shape_32x32b>} : vector<2 x i32> + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 9115de65ff0e8..3fc09f371a347 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -538,9 +538,9 @@ llvm.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.p // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc(ptr %{{.*}}) nvvm.cp.async.mbarrier.arrive %bar_gen {noinc = true} : !llvm.ptr // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.shared(ptr addrspace(3) %{{.*}}) - nvvm.cp.async.mbarrier.arrive.shared %bar_shared : !llvm.ptr<3> + nvvm.cp.async.mbarrier.arrive %bar_shared : !llvm.ptr<3> // CHECK: call void @llvm.nvvm.cp.async.mbarrier.arrive.noinc.shared(ptr addrspace(3) %{{.*}}) - nvvm.cp.async.mbarrier.arrive.shared %bar_shared {noinc = true} : !llvm.ptr<3> + nvvm.cp.async.mbarrier.arrive %bar_shared {noinc = true} : !llvm.ptr<3> llvm.return } diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 8a848221a50dd..3fbd9e0567948 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -1040,6 +1040,19 @@ llvm.func @rocdl.global.load.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { 
llvm.return } +// CHECK-LABEL: rocdl.global.load.async.to.lds +llvm.func @rocdl.global.load.async.to.lds(%src : !llvm.ptr<1>, %dst: !llvm.ptr<3>) { + // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b8 + rocdl.global.load.async.to.lds.b8 %src, %dst, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3> + // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b32 + rocdl.global.load.async.to.lds.b32 %src, %dst, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3> + // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b64 + rocdl.global.load.async.to.lds.b64 %src, %dst, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3> + // CHECK: call void @llvm.amdgcn.global.load.async.to.lds.b128 + rocdl.global.load.async.to.lds.b128 %src, %dst, 0, 0 : !llvm.ptr<1>, !llvm.ptr<3> + llvm.return +} + // CHECK-LABEL: rocdl.tensor.load.to.lds llvm.func @rocdl.tensor.load.to.lds(%dgroup0 : vector<4xi32>, %dgroup1 : vector<8xi32>, %dgroup2 : vector<4xi32>, %dgroup3 : vector<4xi32>) { diff --git a/mlir/test/Target/SPIRV/group-ops.mlir b/mlir/test/Target/SPIRV/group-ops.mlir index cf519cba961c5..6f19b3553dd37 100644 --- a/mlir/test/Target/SPIRV/group-ops.mlir +++ b/mlir/test/Target/SPIRV/group-ops.mlir @@ -1,11 +1,13 @@ -// RUN: mlir-translate -no-implicit-module -test-spirv-roundtrip -split-input-file %s | FileCheck %s +// RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip --split-input-file %s | FileCheck %s // RUN: %if spirv-tools %{ rm -rf %t %} // RUN: %if spirv-tools %{ mkdir %t %} // RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv --split-input-file --spirv-save-validation-files-with-prefix=%t/module %s %} // RUN: %if spirv-tools %{ spirv-val %t %} -spirv.module Logical GLSL450 requires #spirv.vce<v1.3, [Shader, Linkage, SubgroupBallotKHR, Groups, SubgroupBufferBlockIOINTEL, GroupNonUniformArithmetic, GroupUniformArithmeticKHR], [SPV_KHR_storage_buffer_storage_class, SPV_KHR_shader_ballot, SPV_INTEL_subgroups, SPV_KHR_uniform_group_instructions]> { +spirv.module Logical GLSL450 requires #spirv.vce<v1.3, + [Shader, Linkage, SubgroupBallotKHR, Groups, GroupNonUniformArithmetic, GroupUniformArithmeticKHR], + [SPV_KHR_storage_buffer_storage_class, SPV_KHR_shader_ballot, SPV_KHR_uniform_group_instructions]> { // CHECK-LABEL: @subgroup_ballot spirv.func @subgroup_ballot(%predicate: i1) -> vector<4xi32> "None" { // CHECK: %{{.*}} = spirv.KHR.SubgroupBallot %{{.*}}: vector<4xi32> @@ -24,30 +26,6 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.3, [Shader, Linkage, Subgrou %0 = spirv.GroupBroadcast <Workgroup> %value, %localid : f32, vector<3xi32> spirv.ReturnValue %0: f32 } - // CHECK-LABEL: @subgroup_block_read_intel - spirv.func @subgroup_block_read_intel(%ptr : !spirv.ptr<i32, StorageBuffer>) -> i32 "None" { - // CHECK: spirv.INTEL.SubgroupBlockRead %{{.*}} : !spirv.ptr<i32, StorageBuffer> -> i32 - %0 = spirv.INTEL.SubgroupBlockRead %ptr : !spirv.ptr<i32, StorageBuffer> -> i32 - spirv.ReturnValue %0: i32 - } - // CHECK-LABEL: @subgroup_block_read_intel_vector - spirv.func @subgroup_block_read_intel_vector(%ptr : !spirv.ptr<i32, StorageBuffer>) -> vector<3xi32> "None" { - // CHECK: spirv.INTEL.SubgroupBlockRead %{{.*}} : !spirv.ptr<i32, StorageBuffer> -> vector<3xi32> - %0 = spirv.INTEL.SubgroupBlockRead %ptr : !spirv.ptr<i32, StorageBuffer> -> vector<3xi32> - spirv.ReturnValue %0: vector<3xi32> - } - // CHECK-LABEL: @subgroup_block_write_intel - spirv.func @subgroup_block_write_intel(%ptr : !spirv.ptr<i32, StorageBuffer>, %value: i32) -> () "None" { - // CHECK: spirv.INTEL.SubgroupBlockWrite %{{.*}}, 
%{{.*}} : i32 - spirv.INTEL.SubgroupBlockWrite "StorageBuffer" %ptr, %value : i32 - spirv.Return - } - // CHECK-LABEL: @subgroup_block_write_intel_vector - spirv.func @subgroup_block_write_intel_vector(%ptr : !spirv.ptr<i32, StorageBuffer>, %value: vector<3xi32>) -> () "None" { - // CHECK: spirv.INTEL.SubgroupBlockWrite %{{.*}}, %{{.*}} : vector<3xi32> - spirv.INTEL.SubgroupBlockWrite "StorageBuffer" %ptr, %value : vector<3xi32> - spirv.Return - } // CHECK-LABEL: @group_iadd spirv.func @group_iadd(%value: i32) -> i32 "None" { // CHECK: spirv.GroupIAdd <Workgroup> <Reduce> %{{.*}} : i32 diff --git a/mlir/test/Target/SPIRV/subgroup-block-intel.mlir b/mlir/test/Target/SPIRV/subgroup-block-intel.mlir new file mode 100644 index 0000000000000..14060e632fffd --- /dev/null +++ b/mlir/test/Target/SPIRV/subgroup-block-intel.mlir @@ -0,0 +1,34 @@ +// RUN: mlir-translate --no-implicit-module --test-spirv-roundtrip %s | FileCheck %s + +// RUN: %if spirv-tools %{ rm -rf %t %} +// RUN: %if spirv-tools %{ mkdir %t %} +// RUN: %if spirv-tools %{ mlir-translate --no-implicit-module --serialize-spirv --spirv-save-validation-files-with-prefix=%t/module %s %} +// RUN: %if spirv-tools %{ spirv-val %t %} + +spirv.module Physical64 GLSL450 requires #spirv.vce<v1.3, [Addresses, Shader, Linkage, SubgroupBufferBlockIOINTEL], + [SPV_KHR_storage_buffer_storage_class, SPV_INTEL_subgroups]> { + // CHECK-LABEL: @subgroup_block_read_intel + spirv.func @subgroup_block_read_intel(%ptr : !spirv.ptr<i32, StorageBuffer>) -> i32 "None" { + // CHECK: spirv.INTEL.SubgroupBlockRead %{{.*}} : !spirv.ptr<i32, StorageBuffer> -> i32 + %0 = spirv.INTEL.SubgroupBlockRead %ptr : !spirv.ptr<i32, StorageBuffer> -> i32 + spirv.ReturnValue %0: i32 + } + // CHECK-LABEL: @subgroup_block_read_intel_vector + spirv.func @subgroup_block_read_intel_vector(%ptr : !spirv.ptr<i32, StorageBuffer>) -> vector<3xi32> "None" { + // CHECK: spirv.INTEL.SubgroupBlockRead %{{.*}} : !spirv.ptr<i32, StorageBuffer> -> vector<3xi32> + %0 = spirv.INTEL.SubgroupBlockRead %ptr : !spirv.ptr<i32, StorageBuffer> -> vector<3xi32> + spirv.ReturnValue %0: vector<3xi32> + } + // CHECK-LABEL: @subgroup_block_write_intel + spirv.func @subgroup_block_write_intel(%ptr : !spirv.ptr<i32, StorageBuffer>, %value: i32) -> () "None" { + // CHECK: spirv.INTEL.SubgroupBlockWrite %{{.*}}, %{{.*}} : i32 + spirv.INTEL.SubgroupBlockWrite "StorageBuffer" %ptr, %value : i32 + spirv.Return + } + // CHECK-LABEL: @subgroup_block_write_intel_vector + spirv.func @subgroup_block_write_intel_vector(%ptr : !spirv.ptr<i32, StorageBuffer>, %value: vector<3xi32>) -> () "None" { + // CHECK: spirv.INTEL.SubgroupBlockWrite %{{.*}}, %{{.*}} : vector<3xi32> + spirv.INTEL.SubgroupBlockWrite "StorageBuffer" %ptr, %value : vector<3xi32> + spirv.Return + } +} diff --git a/mlir/test/Transforms/test-legalizer-full.mlir b/mlir/test/Transforms/test-legalizer-full.mlir index 42cec68b9fbbb..8da9109a32762 100644 --- a/mlir/test/Transforms/test-legalizer-full.mlir +++ b/mlir/test/Transforms/test-legalizer-full.mlir @@ -72,3 +72,21 @@ builtin.module { } } + +// ----- + +// The region of "test.post_order_legalization" is converted before the op. + +// expected-remark@+1 {{applyFullConversion failed}} +builtin.module { +func.func @test_preorder_legalization() { + // expected-error@+1 {{failed to legalize operation 'test.post_order_legalization'}} + "test.post_order_legalization"() ({ + ^bb0(%arg0: i64): + // Not-explicitly-legal ops are not allowed to survive. 
+ "test.remaining_consumer"(%arg0) : (i64) -> () + "test.invalid"(%arg0) : (i64) -> () + }) : () -> () + return +} +} diff --git a/mlir/test/Transforms/test-legalizer-no-materializations.mlir b/mlir/test/Transforms/test-legalizer-no-materializations.mlir new file mode 100644 index 0000000000000..82dd7422b22b2 --- /dev/null +++ b/mlir/test/Transforms/test-legalizer-no-materializations.mlir @@ -0,0 +1,67 @@ +// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=0 build-materializations=0 attach-debug-materialization-kind=1" -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-KIND + +// CHECK-LABEL: func @dropped_input_in_use +// CHECK-KIND-LABEL: func @dropped_input_in_use +func.func @dropped_input_in_use(%arg: i16, %arg2: i64) { + // CHECK-NEXT: %[[cast:.*]] = "test.cast"() : () -> i16 + // CHECK-NEXT: "work"(%[[cast]]) : (i16) + // CHECK-KIND-NEXT: %[[cast:.*]] = builtin.unrealized_conversion_cast to i16 {__kind__ = "source"} + // CHECK-KIND-NEXT: "work"(%[[cast]]) : (i16) + // expected-remark@+1 {{op 'work' is not legalizable}} + "work"(%arg) : (i16) -> () +} + +// ----- + +// CHECK-KIND-LABEL: func @test_lookup_without_converter +// CHECK-KIND: %[[producer:.*]] = "test.valid_producer"() : () -> i16 +// CHECK-KIND: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[producer]] : i16 to f64 {__kind__ = "target"} +// CHECK-KIND: "test.valid_consumer"(%[[cast]]) : (f64) -> () +// CHECK-KIND: "test.valid_consumer"(%[[producer]]) : (i16) -> () +func.func @test_lookup_without_converter() { + %0 = "test.replace_with_valid_producer"() {type = i16} : () -> (i64) + "test.replace_with_valid_consumer"(%0) {with_converter} : (i64) -> () + // Make sure that the second "replace_with_valid_consumer" lowering does not + // lookup the materialization that was created for the above op. 
+ "test.replace_with_valid_consumer"(%0) : (i64) -> () + // expected-remark@+1 {{op 'func.return' is not legalizable}} + return +} + +// ----- + +// CHECK-LABEL: func @remap_moved_region_args +func.func @remap_moved_region_args() { + // CHECK-NEXT: return + // CHECK-NEXT: ^bb1(%[[arg0:.*]]: i64, %[[arg1:.*]]: i16, %[[arg2:.*]]: i64, %[[arg3:.*]]: f32): + // CHECK-NEXT: %[[cast1:.*]]:2 = builtin.unrealized_conversion_cast %[[arg3]] : f32 to f16, f16 + // CHECK-NEXT: %[[cast2:.*]] = builtin.unrealized_conversion_cast %[[arg2]] : i64 to f64 + // CHECK-NEXT: %[[cast3:.*]] = builtin.unrealized_conversion_cast %[[arg0]] : i64 to f64 + // CHECK-NEXT: %[[cast4:.*]] = "test.cast"(%[[cast1]]#0, %[[cast1]]#1) : (f16, f16) -> f32 + // CHECK-NEXT: "test.valid"(%[[cast3]], %[[cast2]], %[[cast4]]) : (f64, f64, f32) + "test.region"() ({ + ^bb1(%i0: i64, %unused: i16, %i1: i64, %2: f32): + "test.invalid"(%i0, %i1, %2) : (i64, i64, f32) -> () + }) : () -> () + // expected-remark@+1 {{op 'func.return' is not legalizable}} + return +} + +// ----- + +// CHECK-LABEL: func @remap_cloned_region_args +func.func @remap_cloned_region_args() { + // CHECK-NEXT: return + // CHECK-NEXT: ^bb1(%[[arg0:.*]]: i64, %[[arg1:.*]]: i16, %[[arg2:.*]]: i64, %[[arg3:.*]]: f32): + // CHECK-NEXT: %[[cast1:.*]]:2 = builtin.unrealized_conversion_cast %[[arg3]] : f32 to f16, f16 + // CHECK-NEXT: %[[cast2:.*]] = builtin.unrealized_conversion_cast %[[arg2]] : i64 to f64 + // CHECK-NEXT: %[[cast3:.*]] = builtin.unrealized_conversion_cast %[[arg0]] : i64 to f64 + // CHECK-NEXT: %[[cast4:.*]] = "test.cast"(%[[cast1]]#0, %[[cast1]]#1) : (f16, f16) -> f32 + // CHECK-NEXT: "test.valid"(%[[cast3]], %[[cast2]], %[[cast4]]) : (f64, f64, f32) + "test.region"() ({ + ^bb1(%i0: i64, %unused: i16, %i1: i64, %2: f32): + "test.invalid"(%i0, %i1, %2) : (i64, i64, f32) -> () + }) {legalizer.should_clone} : () -> () + // expected-remark@+1 {{op 'func.return' is not legalizable}} + return +} diff --git a/mlir/test/Transforms/test-legalizer-rollback.mlir b/mlir/test/Transforms/test-legalizer-rollback.mlir index 71e11782e14b0..4bcca6b7e5228 100644 --- a/mlir/test/Transforms/test-legalizer-rollback.mlir +++ b/mlir/test/Transforms/test-legalizer-rollback.mlir @@ -163,3 +163,22 @@ func.func @create_unregistered_op_in_pattern() -> i32 { "test.return"(%0) : (i32) -> () } } + +// ----- + +// CHECK-LABEL: func @test_failed_preorder_legalization +// CHECK: "test.post_order_legalization"() ({ +// CHECK: %[[r:.*]] = "test.illegal_op_g"() : () -> i32 +// CHECK: "test.return"(%[[r]]) : (i32) -> () +// CHECK: }) : () -> () +// expected-remark @+1 {{applyPartialConversion failed}} +module { +func.func @test_failed_preorder_legalization() { + // expected-error @+1 {{failed to legalize operation 'test.post_order_legalization' that was explicitly marked illegal}} + "test.post_order_legalization"() ({ + %0 = "test.illegal_op_g"() : () -> (i32) + "test.return"(%0) : (i32) -> () + }) : () -> () + return +} +} diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir index 94c5bb4e93b06..88a71cc26ab0c 100644 --- a/mlir/test/Transforms/test-legalizer.mlir +++ b/mlir/test/Transforms/test-legalizer.mlir @@ -1,7 +1,6 @@ // RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=1" -verify-diagnostics %s | FileCheck %s // RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=1" -verify-diagnostics -profile-actions-to=- %s | FileCheck %s 
--check-prefix=CHECK-PROFILER // RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=0" -verify-diagnostics %s | FileCheck %s -// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=0 build-materializations=0 attach-debug-materialization-kind=1" -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-KIND // CHECK-PROFILER: "name": "pass-execution", "cat": "PERF", "ph": "B" // CHECK-PROFILER: "name": "apply-conversion", "cat": "PERF", "ph": "B" @@ -146,36 +145,6 @@ func.func @no_remap_nested() { // ----- -// CHECK-LABEL: func @remap_moved_region_args -func.func @remap_moved_region_args() { - // CHECK-NEXT: return - // CHECK-NEXT: ^bb1(%{{.*}}: f64, %{{.*}}: f64, %{{.*}}: f16, %{{.*}}: f16): - // CHECK-NEXT: "test.cast"{{.*}} : (f16, f16) -> f32 - // CHECK-NEXT: "test.valid"{{.*}} : (f64, f64, f32) - "test.region"() ({ - ^bb1(%i0: i64, %unused: i16, %i1: i64, %2: f32): - "test.invalid"(%i0, %i1, %2) : (i64, i64, f32) -> () - }) : () -> () - // expected-remark@+1 {{op 'func.return' is not legalizable}} - return -} - -// ----- - -// CHECK-LABEL: func @remap_cloned_region_args -func.func @remap_cloned_region_args() { - // CHECK-NEXT: return - // CHECK-NEXT: ^bb1(%{{.*}}: f64, %{{.*}}: f64, %{{.*}}: f16, %{{.*}}: f16): - // CHECK-NEXT: "test.cast"{{.*}} : (f16, f16) -> f32 - // CHECK-NEXT: "test.valid"{{.*}} : (f64, f64, f32) - "test.region"() ({ - ^bb1(%i0: i64, %unused: i16, %i1: i64, %2: f32): - "test.invalid"(%i0, %i1, %2) : (i64, i64, f32) -> () - }) {legalizer.should_clone} : () -> () - // expected-remark@+1 {{op 'func.return' is not legalizable}} - return -} - // CHECK-LABEL: func @remap_drop_region func.func @remap_drop_region() { // CHECK-NEXT: return @@ -191,12 +160,9 @@ func.func @remap_drop_region() { // ----- // CHECK-LABEL: func @dropped_input_in_use -// CHECK-KIND-LABEL: func @dropped_input_in_use func.func @dropped_input_in_use(%arg: i16, %arg2: i64) { // CHECK-NEXT: %[[cast:.*]] = "test.cast"() : () -> i16 // CHECK-NEXT: "work"(%[[cast]]) : (i16) - // CHECK-KIND-NEXT: %[[cast:.*]] = builtin.unrealized_conversion_cast to i16 {__kind__ = "source"} - // CHECK-KIND-NEXT: "work"(%[[cast]]) : (i16) // expected-remark@+1 {{op 'work' is not legalizable}} "work"(%arg) : (i16) -> () } @@ -452,11 +418,6 @@ func.func @test_multiple_1_to_n_replacement() { // CHECK: %[[cast:.*]] = "test.cast"(%[[producer]]) : (i16) -> f64 // CHECK: "test.valid_consumer"(%[[cast]]) : (f64) -> () // CHECK: "test.valid_consumer"(%[[producer]]) : (i16) -> () -// CHECK-KIND-LABEL: func @test_lookup_without_converter -// CHECK-KIND: %[[producer:.*]] = "test.valid_producer"() : () -> i16 -// CHECK-KIND: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[producer]] : i16 to f64 {__kind__ = "target"} -// CHECK-KIND: "test.valid_consumer"(%[[cast]]) : (f64) -> () -// CHECK-KIND: "test.valid_consumer"(%[[producer]]) : (i16) -> () func.func @test_lookup_without_converter() { %0 = "test.replace_with_valid_producer"() {type = i16} : () -> (i64) "test.replace_with_valid_consumer"(%0) {with_converter} : (i64) -> () @@ -487,3 +448,35 @@ func.func @test_working_1to1_pattern(%arg0: f16) { "test.type_consumer"(%arg0) : (f16) -> () "test.return"() : () -> () } + +// ----- + +// The region of "test.post_order_legalization" is converted before the op. 
+ + // CHECK: notifyBlockInserted into test.post_order_legalization: was unlinked + // CHECK: notifyOperationInserted: test.invalid + // CHECK: notifyBlockErased + // CHECK: notifyOperationInserted: test.valid, was unlinked + // CHECK: notifyOperationReplaced: test.invalid + // CHECK: notifyOperationErased: test.invalid + // CHECK: notifyOperationModified: test.post_order_legalization + + // CHECK-LABEL: func @test_preorder_legalization + // CHECK: "test.post_order_legalization"() ({ + // CHECK: ^{{.*}}(%[[arg0:.*]]: f64): + // Note: The survival of a not-explicitly-invalid operation does *not* cause + // a conversion failure when applying a partial conversion. + // CHECK: %[[cast:.*]] = "test.cast"(%[[arg0]]) : (f64) -> i64 + // CHECK: "test.remaining_consumer"(%[[cast]]) : (i64) -> () + // CHECK: "test.valid"(%[[arg0]]) : (f64) -> () + // CHECK: }) {is_legal} : () -> () +func.func @test_preorder_legalization() { + "test.post_order_legalization"() ({ + ^bb0(%arg0: i64): + // expected-remark @+1 {{'test.remaining_consumer' is not legalizable}} + "test.remaining_consumer"(%arg0) : (i64) -> () + "test.invalid"(%arg0) : (i64) -> () + }) : () -> () + // expected-remark @+1 {{'func.return' is not legalizable}} + return +} diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index fd2b943ff1296..9b64bc691588d 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -1418,6 +1418,22 @@ class TestTypeConsumerOpPattern } }; +class TestPostOrderLegalization : public ConversionPattern { +public: + TestPostOrderLegalization(MLIRContext *ctx, const TypeConverter &converter) + : ConversionPattern(converter, "test.post_order_legalization", 1, ctx) {} + LogicalResult + matchAndRewrite(Operation *op, ArrayRef<ValueRange> operands, + ConversionPatternRewriter &rewriter) const final { + for (Region &r : op->getRegions()) + if (failed(rewriter.legalize(&r))) + return failure(); + rewriter.modifyOpInPlace( + op, [&]() { op->setAttr("is_legal", rewriter.getUnitAttr()); }); + return success(); + } +}; + /// Test unambiguous overload resolution of replaceOpWithMultiple. This /// function is just to trigger compiler errors. It is never executed.
[[maybe_unused]] void testReplaceOpWithMultipleOverloads( @@ -1532,7 +1548,8 @@ struct TestLegalizePatternDriver patterns.add<TestDropOpSignatureConversion, TestDropAndReplaceInvalidOp, TestPassthroughInvalidOp, TestMultiple1ToNReplacement, TestValueReplace, TestReplaceWithValidConsumer, - TestTypeConsumerOpPattern>(&getContext(), converter); + TestTypeConsumerOpPattern, TestPostOrderLegalization>( + &getContext(), converter); patterns.add<TestConvertBlockArgs>(converter, &getContext()); mlir::populateAnyFunctionOpInterfaceTypeConversionPattern(patterns, converter); @@ -1553,14 +1570,16 @@ struct TestLegalizePatternDriver [](Type type) { return type.isF32(); }); }); target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp op) { - return converter.isSignatureLegal(op.getFunctionType()) && - converter.isLegal(&op.getBody()); + return converter.isSignatureLegal(op.getFunctionType()); }); target.addDynamicallyLegalOp<func::CallOp>( [&](func::CallOp op) { return converter.isLegal(op); }); target.addDynamicallyLegalOp( OperationName("test.value_replace", &getContext()), [](Operation *op) { return op->hasAttr("is_legal"); }); + target.addDynamicallyLegalOp( + OperationName("test.post_order_legalization", &getContext()), + [](Operation *op) { return op->hasAttr("is_legal"); }); // TestCreateUnregisteredOp creates `arith.constant` operation, // which was not added to target intentionally to test @@ -2156,8 +2175,7 @@ struct TestTypeConversionDriver recursiveType.getName() == "outer_converted_type"); }); target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp op) { - return converter.isSignatureLegal(op.getFunctionType()) && - converter.isLegal(&op.getBody()); + return converter.isSignatureLegal(op.getFunctionType()); }); target.addDynamicallyLegalOp<TestCastOp>([&](TestCastOp op) { // Allow casts from F64 to F32. 
diff --git a/mlir/test/lib/Dialect/Test/TestTypeDefs.td b/mlir/test/lib/Dialect/Test/TestTypeDefs.td index ea20597231d58..9859bd06cb526 100644 --- a/mlir/test/lib/Dialect/Test/TestTypeDefs.td +++ b/mlir/test/lib/Dialect/Test/TestTypeDefs.td @@ -470,4 +470,11 @@ def TestMemrefType : Test_Type<"TestMemref", }]; } +// Test implementation of an interface with methods specifying a +// method body +def TestBaseBody : Test_Type<"TestBaseBody", + [DeclareTypeInterfaceMethods<TestBaseTypeInterfacePrintTypeA>]> { + let mnemonic = "test_base_body"; +} + #endif // TEST_TYPEDEFS diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index 76d461108b296..93d51441f5b81 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp @@ -200,7 +200,8 @@ class TestStepOpPattern : public OpConversionPattern<vector::StepOp> { Value sgId = gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr); - auto maybeOffsets = sliceAttr.getOffsets(rewriter, loc, sgId, wgShape); + auto maybeOffsets = + sliceAttr.computeDistributedCoords(rewriter, loc, sgId, wgShape); if (failed(maybeOffsets)) return failure(); diff --git a/mlir/test/mlir-tblgen/cpp-class-comments.td b/mlir/test/mlir-tblgen/cpp-class-comments.td index 9dcf975e45286..0d3445d6647af 100644 --- a/mlir/test/mlir-tblgen/cpp-class-comments.td +++ b/mlir/test/mlir-tblgen/cpp-class-comments.td @@ -36,6 +36,7 @@ def A_SomeOp1 : Op<A_Dialect, "some_op1", []>{ let cppNamespace = "OP1"; // OP: namespace OP1 +// OP-EMPTY: // OP-NEXT: /// Some Op1 summary line1 // OP-NEXT: /// summary line2 // OP-NEXT: /// Some Op1 description @@ -97,6 +98,7 @@ def EncodingTrait : AttrInterface<"EncodingTrait"> { let methods = [ ]; // ATTR-INTERFACE: namespace mlir::a::traits { +// ATTR-INTERFACE-EMPTY: // ATTR-INTERFACE-NEXT: /// Common trait for all layouts. 
// ATTR-INTERFACE-NEXT: class EncodingTrait; } @@ -104,6 +106,7 @@ def EncodingTrait : AttrInterface<"EncodingTrait"> { def SimpleEncodingTrait : AttrInterface<"SimpleEncodingTrait"> { let cppNamespace = "a::traits"; // ATTR-INTERFACE: namespace a::traits { +// ATTR-INTERFACE-EMPTY: // ATTR-INTERFACE-NEXT: class SimpleEncodingTrait; } @@ -114,6 +117,7 @@ def SimpleOpInterface : OpInterface<"SimpleOpInterface"> { Simple Op Interface description }]; // OP-INTERFACE: namespace a::traits { +// OP-INTERFACE-EMPTY: // OP-INTERFACE-NEXT: /// Simple Op Interface description // OP-INTERFACE-NEXT: class SimpleOpInterface; } diff --git a/mlir/test/python/dialects/transform_structured_ext.py b/mlir/test/python/dialects/transform_structured_ext.py index d6b70dc9d1978..e58b7646316fc 100644 --- a/mlir/test/python/dialects/transform_structured_ext.py +++ b/mlir/test/python/dialects/transform_structured_ext.py @@ -627,12 +627,16 @@ def testVectorizeChildrenAndApplyPatternsAllAttrs(target): disable_transfer_permutation_map_lowering_patterns=True, vectorize_nd_extract=True, vectorize_padding=True, + flatten_1d_depthwise_conv=True, + fold_type_extensions_into_contract=True, ) # CHECK-LABEL: TEST: testVectorizeChildrenAndApplyPatternsAllAttrs # CHECK: transform.sequence # CHECK: = transform.structured.vectorize # CHECK-SAME: disable_multi_reduction_to_contract_patterns # CHECK-SAME: disable_transfer_permutation_map_lowering_patterns + # CHECK-SAME: flatten_1d_depthwise_conv + # CHECK-SAME: fold_type_extensions_into_contract # CHECK-SAME: vectorize_nd_extract # CHECK-SAME: vectorize_padding @@ -646,12 +650,16 @@ def testVectorizeChildrenAndApplyPatternsNoAttrs(target): disable_transfer_permutation_map_lowering_patterns=False, vectorize_nd_extract=False, vectorize_padding=False, + flatten_1d_depthwise_conv=False, + fold_type_extensions_into_contract=False, ) # CHECK-LABEL: TEST: testVectorizeChildrenAndApplyPatternsNoAttrs # CHECK: transform.sequence # CHECK: = transform.structured.vectorize # CHECK-NOT: disable_multi_reduction_to_contract_patterns # CHECK-NOT: disable_transfer_permutation_map_lowering_patterns + # CHECK-NOT: flatten_1d_depthwise_conv + # CHECK-NOT: fold_type_extensions_into_contract # CHECK-NOT: vectorize_nd_extract # CHECK-NOT: vectorize_padding diff --git a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp index 8ec2e03095423..2a513c3b8cc9b 100644 --- a/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp +++ b/mlir/tools/mlir-tblgen/AttrOrTypeDefGen.cpp @@ -637,8 +637,10 @@ void DefGen::emitTraitMethods(const InterfaceTrait &trait) { for (auto &method : iface.getMethods()) { // Don't declare if the method has a body. Or if the method has a default // implementation and the def didn't request that it always be declared. - if (method.getBody() || (method.getDefaultImplementation() && - !alwaysDeclared.count(method.getName()))) { + if (method.getBody()) + continue; + if (method.getDefaultImplementation() && + !alwaysDeclared.count(method.getName())) { genTraitMethodUsingDecl(trait, method); continue; } diff --git a/mlir/utils/generate-test-checks.py b/mlir/utils/generate-test-checks.py index 3712a6b9c963d..22774468bb403 100755 --- a/mlir/utils/generate-test-checks.py +++ b/mlir/utils/generate-test-checks.py @@ -230,14 +230,11 @@ def process_line(line_chunks, variable_namer, use_ssa_name=False, strict_name_re # Process the source file lines. The source file doesn't have to be .mlir. 
-def process_source_lines(source_lines, note, args): +def process_source_lines(source_lines, args): source_split_re = re.compile(args.source_delim_regex) source_segments = [[]] for line in source_lines: - # Remove previous note. - if line in note: - continue # Remove previous CHECK lines. if line.find(args.check_prefix) != -1: continue @@ -359,9 +356,10 @@ def main(): source_segments = None if args.source: - source_segments = process_source_lines( - [l.rstrip() for l in open(args.source, "r")], autogenerated_note, args - ) + with open(args.source, "r") as f: + raw_source = f.read().replace(autogenerated_note, "") + raw_source_lines = [l.rstrip() for l in raw_source.splitlines()] + source_segments = process_source_lines(raw_source_lines, args) if args.inplace: assert args.output is None diff --git a/offload/liboffload/API/Device.td b/offload/liboffload/API/Device.td index 5b54c79d83f9d..e9c154818c4a1 100644 --- a/offload/liboffload/API/Device.td +++ b/offload/liboffload/API/Device.td @@ -29,6 +29,7 @@ def ol_device_info_t : Enum { TaggedEtor<"PLATFORM", "ol_platform_handle_t", "the platform associated with the device">, TaggedEtor<"NAME", "char[]", "Device name">, TaggedEtor<"PRODUCT_NAME", "char[]", "Device user-facing marketing name">, + TaggedEtor<"UID", "char[]", "Device UID">, TaggedEtor<"VENDOR", "char[]", "Device vendor">, TaggedEtor<"DRIVER_VERSION", "char[]", "Driver version">, TaggedEtor<"MAX_WORK_GROUP_SIZE", "uint32_t", "Maximum total work group size in work items">, diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 6d22faeb0e57e..84bc414396811 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -147,8 +147,8 @@ llvm::Error ol_platform_impl_t::init() { if (llvm::Error Err = Plugin->initDevice(Id)) return Err; - auto Device = &Plugin->getDevice(Id); - auto Info = Device->obtainInfoImpl(); + GenericDeviceTy *Device = &Plugin->getDevice(Id); + llvm::Expected<InfoTreeNode> Info = Device->obtainInfo(); if (llvm::Error Err = Info.takeError()) return Err; Devices.emplace_back(std::make_unique<ol_device_impl_t>(Id, Device, *this, @@ -467,6 +467,7 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device, switch (PropName) { case OL_DEVICE_INFO_NAME: case OL_DEVICE_INFO_PRODUCT_NAME: + case OL_DEVICE_INFO_UID: case OL_DEVICE_INFO_VENDOR: case OL_DEVICE_INFO_DRIVER_VERSION: { // String values @@ -544,6 +545,8 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device, return Info.writeString("Virtual Host Device"); case OL_DEVICE_INFO_PRODUCT_NAME: return Info.writeString("Virtual Host Device"); + case OL_DEVICE_INFO_UID: + return Info.writeString(GenericPluginTy::getHostDeviceUid()); case OL_DEVICE_INFO_VENDOR: return Info.writeString("Liboffload"); case OL_DEVICE_INFO_DRIVER_VERSION: diff --git a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h index 29cfe78082dbb..ddfa65c76cf2d 100644 --- a/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h +++ b/offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h @@ -72,6 +72,7 @@ typedef enum hsa_amd_agent_info_s { HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU = 0xA00A, HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU = 0xA00B, HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES = 0xA010, + HSA_AMD_AGENT_INFO_UUID = 0xA011, HSA_AMD_AGENT_INFO_TIMESTAMP_FREQUENCY = 0xA016, } hsa_amd_agent_info_t; diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index 
0b03ef534d273..928c6cd7569e3 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2083,6 +2083,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Err; ComputeUnitKind = GPUName; + // From the ROCm HSA documentation: + // Query the UUID of the agent. The value is an Ascii string with a maximum + // of 21 chars including NUL. The string value consists of two parts: header + // and body. The header identifies the device type (GPU, CPU, DSP) while the + // body encodes the UUID as a 16 digit hex string. + // + // Agents that do not support UUID will return the string "GPU-XX" or + // "CPU-XX" or "DSP-XX" depending on their device type. + char UUID[24] = {0}; + if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_UUID, UUID)) + return Err; + if (!StringRef(UUID).ends_with("-XX")) + setDeviceUidFromVendorUid(UUID); + // Get the wavefront size. uint32_t WavefrontSize = 0; if (auto Err = getDeviceAttr(HSA_AGENT_INFO_WAVEFRONT_SIZE, WavefrontSize)) diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index f9bff9abd903c..f9dcdea7213fd 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -791,6 +791,9 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// this id is not unique between different plugins; they may overlap. int32_t getDeviceId() const { return DeviceId; } + /// Get the unique identifier of the device. + const char *getDeviceUid() const { return DeviceUid.c_str(); } + /// Set the context of the device if needed, before calling device-specific /// functions. Plugins may implement this function as a no-op if not needed. virtual Error setContext() = 0; @@ -989,9 +992,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy { Error syncEvent(void *EventPtr); virtual Error syncEventImpl(void *EventPtr) = 0; + /// Obtain information about the device. + Expected<InfoTreeNode> obtainInfo(); + virtual Expected<InfoTreeNode> obtainInfoImpl() = 0; + /// Print information about the device. Error printInfo(); - virtual Expected<InfoTreeNode> obtainInfoImpl() = 0; /// Return true if the device has work that is either queued or currently /// running @@ -1204,6 +1210,14 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// global device id and is not the device id visible to the OpenMP user. const int32_t DeviceId; + /// The unique identifier of the device. + /// By default, the unique identifier of the device is set to the device id, + /// combined with the plugin name, since the offload device id may overlap + /// between different plugins. + std::string DeviceUid; + /// Construct the device UID from the vendor (U)UID. + void setDeviceUidFromVendorUid(StringRef VendorUid); + /// The default grid values used for this device. llvm::omp::GV GridValues; @@ -1290,6 +1304,9 @@ struct GenericPluginTy { return UserDeviceIds.at(DeviceId); } + /// Get the UID for the host device. + static constexpr const char *getHostDeviceUid() { return "HOST"; } + /// Get the ELF code to recognize the binary image of this plugin.
virtual uint16_t getMagicElfBits() const = 0; diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 36d643b65922d..d7e5a21600abf 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -715,6 +715,10 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId, DeviceId(DeviceId), GridValues(OMPGridValues), PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(), PinnedAllocs(*this), RPCServer(nullptr) { + // Conservative fall-back to the plugin's device uid for the case that no real + // vendor (u)uid will become available later. + setDeviceUidFromVendorUid(std::to_string(static_cast<uint64_t>(DeviceId))); + #ifdef OMPT_SUPPORT OmptInitialized.store(false); // Bind the callbacks to this device's member functions @@ -1524,15 +1528,22 @@ Error GenericDeviceTy::enqueueHostCall(void (*Callback)(void *), void *UserData, return Err; } +Expected<InfoTreeNode> GenericDeviceTy::obtainInfo() { + auto InfoOrErr = obtainInfoImpl(); + if (InfoOrErr) + InfoOrErr->add("UID", getDeviceUid(), "", DeviceInfo::UID); + return InfoOrErr; +} + Error GenericDeviceTy::printInfo() { - auto Info = obtainInfoImpl(); + auto InfoOrErr = obtainInfo(); // Get the vendor-specific info entries describing the device properties. - if (auto Err = Info.takeError()) + if (auto Err = InfoOrErr.takeError()) return Err; // Print all info entries. - Info->print(); + InfoOrErr->print(); return Plugin::success(); } @@ -1603,6 +1614,10 @@ Expected<bool> GenericDeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) { return isAccessiblePtrImpl(Ptr, Size); } +void GenericDeviceTy::setDeviceUidFromVendorUid(StringRef VendorUid) { + DeviceUid = std::string(Plugin.getName()) + "-" + std::string(VendorUid); +} + Error GenericPluginTy::init() { if (Initialized) return Plugin::success(); diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp index f5b2d074a47e7..e7a1ca38b3c13 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp @@ -35,6 +35,7 @@ DLWRAP(cuFuncSetAttribute, 3) // Device info DLWRAP(cuDeviceGetName, 3) +DLWRAP(cuDeviceGetUuid, 2) DLWRAP(cuDeviceTotalMem, 2) DLWRAP(cuDriverGetVersion, 1) diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h index dec4e33508c62..a470d6df1079d 100644 --- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h +++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h @@ -33,6 +33,9 @@ typedef struct CUfunc_st *CUfunction; typedef void (*CUhostFn)(void *userData); typedef struct CUstream_st *CUstream; typedef struct CUevent_st *CUevent; +typedef struct CUuuid_st { + char bytes[16]; +} CUuuid; #define CU_DEVICE_INVALID ((CUdevice)(-2)) @@ -301,6 +304,7 @@ CUresult cuFuncSetAttribute(CUfunction, CUfunction_attribute, int); // Device info CUresult cuDeviceGetName(char *, int, CUdevice); +CUresult cuDeviceGetUuid(CUuuid *, CUdevice); CUresult cuDeviceTotalMem(size_t *, CUdevice); CUresult cuDriverGetVersion(int *); diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index db94f7f2dd995..a9adcc397fb7b 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -25,6 +25,7 @@ #include "PluginInterface.h" #include "Utils/ELF.h" +#include 
"llvm/ADT/StringExtras.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPGridValues.h" @@ -293,6 +294,12 @@ struct CUDADeviceTy : public GenericDeviceTy { if (auto Err = Plugin::check(Res, "error in cuDeviceGet: %s")) return Err; + CUuuid UUID = {0}; + Res = cuDeviceGetUuid(&UUID, Device); + if (auto Err = Plugin::check(Res, "error in cuDeviceGetUuid: %s")) + return Err; + setDeviceUidFromVendorUid(toHex(UUID.bytes, true)); + // Query the current flags of the primary context and set its flags if // it is inactive. unsigned int FormerPrimaryCtxFlags = 0; diff --git a/offload/tools/deviceinfo/llvm-offload-device-info.cpp b/offload/tools/deviceinfo/llvm-offload-device-info.cpp index 9b58d67f017ca..42ffb97d6d77c 100644 --- a/offload/tools/deviceinfo/llvm-offload-device-info.cpp +++ b/offload/tools/deviceinfo/llvm-offload-device-info.cpp @@ -176,6 +176,7 @@ ol_result_t printDevice(std::ostream &S, ol_device_handle_t D) { printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_NAME, "Name")); OFFLOAD_ERR(printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_PRODUCT_NAME, "Product Name")); + OFFLOAD_ERR(printDeviceValue<const char *>(S, D, OL_DEVICE_INFO_UID, "UID")); OFFLOAD_ERR( printDeviceValue<ol_device_type_t>(S, D, OL_DEVICE_INFO_TYPE, "Type")); OFFLOAD_ERR(printDeviceValue<const char *>( diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp index 8cb0b8065c33e..30eafee026316 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfo.cpp @@ -98,6 +98,16 @@ TEST_P(olGetDeviceInfoTest, SuccessProductName) { ASSERT_EQ(std::strlen(Name.data()), Size - 1); } +TEST_P(olGetDeviceInfoTest, SuccessUID) { + size_t Size = 0; + ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_UID, &Size)); + ASSERT_GT(Size, 0ul); + std::vector<char> UID; + UID.resize(Size); + ASSERT_SUCCESS(olGetDeviceInfo(Device, OL_DEVICE_INFO_UID, Size, UID.data())); + ASSERT_EQ(std::strlen(UID.data()), Size - 1); +} + TEST_P(olGetDeviceInfoTest, HostProductName) { size_t Size = 0; ASSERT_SUCCESS(olGetDeviceInfoSize(Host, OL_DEVICE_INFO_PRODUCT_NAME, &Size)); @@ -109,6 +119,16 @@ TEST_P(olGetDeviceInfoTest, HostProductName) { ASSERT_EQ(std::strlen(Name.data()), Size - 1); } +TEST_P(olGetDeviceInfoTest, HostUID) { + size_t Size = 0; + ASSERT_SUCCESS(olGetDeviceInfoSize(Host, OL_DEVICE_INFO_UID, &Size)); + ASSERT_GT(Size, 0ul); + std::vector<char> UID; + UID.resize(Size); + ASSERT_SUCCESS(olGetDeviceInfo(Host, OL_DEVICE_INFO_UID, Size, UID.data())); + ASSERT_EQ(std::strlen(UID.data()), Size - 1); +} + TEST_P(olGetDeviceInfoTest, SuccessVendor) { size_t Size = 0; ASSERT_SUCCESS(olGetDeviceInfoSize(Device, OL_DEVICE_INFO_VENDOR, &Size)); diff --git a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp index c4a3c2d5e3c75..79a18c1d133dc 100644 --- a/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp +++ b/offload/unittests/OffloadAPI/device/olGetDeviceInfoSize.cpp @@ -32,6 +32,7 @@ OL_DEVICE_INFO_SIZE_TEST_EQ(Platform, ol_platform_handle_t, OL_DEVICE_INFO_PLATFORM); OL_DEVICE_INFO_SIZE_TEST_NONZERO(Name, OL_DEVICE_INFO_NAME); OL_DEVICE_INFO_SIZE_TEST_NONZERO(ProductName, OL_DEVICE_INFO_PRODUCT_NAME); +OL_DEVICE_INFO_SIZE_TEST_NONZERO(UID, OL_DEVICE_INFO_UID); OL_DEVICE_INFO_SIZE_TEST_NONZERO(Vendor, OL_DEVICE_INFO_VENDOR); 
OL_DEVICE_INFO_SIZE_TEST_NONZERO(DriverVersion, OL_DEVICE_INFO_DRIVER_VERSION); OL_DEVICE_INFO_SIZE_TEST_EQ(MaxWorkGroupSize, uint32_t, diff --git a/openmp/runtime/src/CMakeLists.txt b/openmp/runtime/src/CMakeLists.txt index 6ac047a833fe5..5dd7f4b33612d 100644 --- a/openmp/runtime/src/CMakeLists.txt +++ b/openmp/runtime/src/CMakeLists.txt @@ -254,23 +254,35 @@ set(LIBOMP_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE) # Add symbolic links to libomp if(NOT WIN32) - add_custom_command(TARGET omp POST_BUILD - COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} - libgomp${LIBOMP_LIBRARY_SUFFIX} - COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} - libiomp5${LIBOMP_LIBRARY_SUFFIX} - WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR} - ) - if(LIBOMP_ENABLE_SHARED) - if(APPLE) - set(VERSIONED_LIBGOMP_NAME libgomp.1${LIBOMP_LIBRARY_SUFFIX}) - else() - set(VERSIONED_LIBGOMP_NAME libgomp${LIBOMP_LIBRARY_SUFFIX}.1) - endif() + if(AIX) + # On AIX, libomp.a is the name for both static and shared objects. + set(LIBOMP_AIX_SUFFIX ${CMAKE_STATIC_LIBRARY_SUFFIX}) add_custom_command(TARGET omp POST_BUILD - COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} ${VERSIONED_LIBGOMP_NAME} + COMMAND ${CMAKE_COMMAND} -E create_symlink + ${LIBOMP_LIB_NAME}${LIBOMP_AIX_SUFFIX} libgomp${LIBOMP_AIX_SUFFIX} + COMMAND ${CMAKE_COMMAND} -E create_symlink + ${LIBOMP_LIB_NAME}${LIBOMP_AIX_SUFFIX} libiomp5${LIBOMP_AIX_SUFFIX} WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR} ) + else() + add_custom_command(TARGET omp POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} + libiomp5${LIBOMP_LIBRARY_SUFFIX} + COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} + libgomp${LIBOMP_LIBRARY_SUFFIX} + WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR} + ) + if(LIBOMP_ENABLE_SHARED) + if(APPLE) + set(VERSIONED_LIBGOMP_NAME libgomp.1${LIBOMP_LIBRARY_SUFFIX}) + else() + set(VERSIONED_LIBGOMP_NAME libgomp${LIBOMP_LIBRARY_SUFFIX}.1) + endif() + add_custom_command(TARGET omp POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink ${LIBOMP_LIB_FILE} ${VERSIONED_LIBGOMP_NAME} + WORKING_DIRECTORY ${LIBOMP_LIBRARY_DIR} + ) + endif() endif() endif() diff --git a/polly/lib/Support/GICHelper.cpp b/polly/lib/Support/GICHelper.cpp index 027e0194732f4..948bb6a9b9614 100644 --- a/polly/lib/Support/GICHelper.cpp +++ b/polly/lib/Support/GICHelper.cpp @@ -59,7 +59,7 @@ APInt polly::APIntFromVal(__isl_take isl_val *Val) { Data = (uint64_t *)malloc(NumChunks * ChunkSize); isl_val_get_abs_num_chunks(Val, ChunkSize, Data); int NumBits = CHAR_BIT * ChunkSize * NumChunks; - APInt A(NumBits, NumChunks, Data); + APInt A(NumBits, ArrayRef(Data, NumChunks)); // As isl provides only an interface to obtain data that describes the // absolute value of an isl_val, A at this point always contains a positive diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp index 070700a64a168..0888ebd7a9362 100644 --- a/polly/lib/Transform/ScheduleOptimizer.cpp +++ b/polly/lib/Transform/ScheduleOptimizer.cpp @@ -237,6 +237,7 @@ struct OptimizerAdditionalInfoTy { bool Postopts; bool Prevect; bool &DepsChanged; + IslMaxOperationsGuard &MaxOpGuard; }; class ScheduleTreeOptimizer final { @@ -381,6 +382,8 @@ class ScheduleTreeOptimizer final { isl::schedule_node ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node, int VectorWidth) { + if (Node.is_null()) + return {}; assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); Node = Node.child(0).child(0); isl::union_map 
SchedRelUMap = Node.get_prefix_schedule_relation(); @@ -391,6 +394,8 @@ ScheduleTreeOptimizer::isolateFullPartialTiles(isl::schedule_node Node, isl::union_set IsolateOption = getIsolateOptions(IsolateDomain, 1); Node = Node.parent().parent(); isl::union_set Options = IsolateOption.unite(AtomicOption); + if (Node.is_null()) + return {}; isl::schedule_node_band Result = Node.as<isl::schedule_node_band>().set_ast_build_options(Options); return Result; @@ -411,9 +416,13 @@ struct InsertSimdMarkers final : ScheduleNodeRewriter<InsertSimdMarkers> { isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand( isl::schedule_node Node, unsigned DimToVectorize, int VectorWidth) { + if (Node.is_null()) + return {}; assert(isl_schedule_node_get_type(Node.get()) == isl_schedule_node_band); auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); + if (Space.is_null()) + return {}; unsigned ScheduleDimensions = unsignedFromIslSize(Space.dim(isl::dim::set)); assert(DimToVectorize < ScheduleDimensions); @@ -439,12 +448,15 @@ isl::schedule_node ScheduleTreeOptimizer::prevectSchedBand( // Sink the inner loop into the smallest possible statements to make them // represent a single vector instruction if possible. Node = isl::manage(isl_schedule_node_band_sink(Node.release())); + if (Node.is_null()) + return {}; // Add SIMD markers to those vector statements. InsertSimdMarkers SimdMarkerInserter; Node = SimdMarkerInserter.visit(Node); - PrevectOpts++; + if (!Node.is_null()) + PrevectOpts++; return Node.parent(); } @@ -535,6 +547,8 @@ ScheduleTreeOptimizer::applyTileBandOpt(isl::schedule_node Node) { isl::schedule_node ScheduleTreeOptimizer::applyPrevectBandOpt(isl::schedule_node Node) { auto Space = isl::manage(isl_schedule_node_band_get_space(Node.get())); + if (Space.is_null()) + return {}; int Dims = unsignedFromIslSize(Space.dim(isl::dim::set)); for (int i = Dims - 1; i >= 0; i--) @@ -572,9 +586,14 @@ ScheduleTreeOptimizer::optimizeBand(__isl_take isl_schedule_node *NodeArg, Node = applyTileBandOpt(Node); if (OAI->Prevect) { + IslQuotaScope MaxScope = OAI->MaxOpGuard.enter(); + // FIXME: Prevectorization requirements are different from those checked by // isTileableBandNode. Node = applyPrevectBandOpt(Node); + + if (OAI->MaxOpGuard.hasQuotaExceeded() || Node.is_null()) + return (isl::schedule_node()).release(); } return Node.release(); @@ -771,6 +790,10 @@ static void runIslScheduleOptimizer( return; } + isl_ctx *Ctx = S.getIslCtx().get(); + IslMaxOperationsGuard MaxOpGuard(Ctx, ScheduleComputeOut, + /*AutoEnter=*/false); + // Apply ISL's algorithm only if not overridden by the user. 
Note that // post-rescheduling optimizations (tiling, pattern-based, prevectorization) // rely on the coincidence/permutable annotations on schedule tree bands that @@ -853,8 +876,6 @@ static void runIslScheduleOptimizer( IslOuterCoincidence = 0; } - isl_ctx *Ctx = S.getIslCtx().get(); - isl_options_set_schedule_outer_coincidence(Ctx, IslOuterCoincidence); isl_options_set_schedule_maximize_band_depth(Ctx, IslMaximizeBands); isl_options_set_schedule_max_constant_term(Ctx, MaxConstantTerm); @@ -870,28 +891,20 @@ static void runIslScheduleOptimizer( SC = SC.set_coincidence(Validity); { - IslMaxOperationsGuard MaxOpGuard(Ctx, ScheduleComputeOut); + IslQuotaScope MaxOpScope = MaxOpGuard.enter(); Schedule = SC.compute_schedule(); - - if (MaxOpGuard.hasQuotaExceeded()) - POLLY_DEBUG( - dbgs() << "Schedule optimizer calculation exceeds ISL quota\n"); } isl_options_set_on_error(Ctx, OnErrorStatus); - ScopsRescheduled++; + if (!Schedule.is_null()) + ScopsRescheduled++; POLLY_DEBUG(printSchedule(dbgs(), Schedule, "After rescheduling")); } walkScheduleTreeForStatistics(Schedule, 1); - // In cases the scheduler is not able to optimize the code, we just do not - // touch the schedule. - if (Schedule.is_null()) - return; - - if (GreedyFusion) { + if (GreedyFusion && !Schedule.is_null()) { isl::union_map Validity = D.getDependences( Dependences::TYPE_RAW | Dependences::TYPE_WAR | Dependences::TYPE_WAW); Schedule = applyGreedyFusion(Schedule, Validity); @@ -905,14 +918,20 @@ static void runIslScheduleOptimizer( /*PatternOpts=*/!HasUserTransformation && PMBasedOpts, /*Postopts=*/!HasUserTransformation && EnablePostopts, /*Prevect=*/PollyVectorizerChoice != VECTORIZER_NONE, - DepsChanged}; - if (OAI.PatternOpts || OAI.Postopts || OAI.Prevect) { + DepsChanged, + MaxOpGuard}; + if (!Schedule.is_null() && (OAI.PatternOpts || OAI.Postopts || OAI.Prevect)) { Schedule = ScheduleTreeOptimizer::optimizeSchedule(Schedule, &OAI); Schedule = hoistExtensionNodes(Schedule); POLLY_DEBUG(printSchedule(dbgs(), Schedule, "After post-optimizations")); walkScheduleTreeForStatistics(Schedule, 2); } + if (MaxOpGuard.hasQuotaExceeded()) { + POLLY_DEBUG(dbgs() << "Schedule optimizer calculation exceeds ISL quota\n"); + return; + } + // Skip profitability check if user transformation(s) have been applied. if (!HasUserTransformation && !ScheduleTreeOptimizer::isProfitableSchedule(S, Schedule)) diff --git a/polly/lib/Transform/ScheduleTreeTransform.cpp b/polly/lib/Transform/ScheduleTreeTransform.cpp index 3f3630027e6e3..c95c55858f038 100644 --- a/polly/lib/Transform/ScheduleTreeTransform.cpp +++ b/polly/lib/Transform/ScheduleTreeTransform.cpp @@ -972,6 +972,9 @@ BandAttr *polly::getBandAttr(isl::schedule_node MarkOrBand) { } isl::schedule polly::hoistExtensionNodes(isl::schedule Sched) { + if (Sched.is_null()) + return {}; + // If there is no extension node in the first place, return the original // schedule tree. 
if (!containsExtensionNode(Sched)) @@ -1126,6 +1129,8 @@ isl::set polly::getPartialTilePrefixes(isl::set ScheduleRange, isl::union_set polly::getIsolateOptions(isl::set IsolateDomain, unsigned OutDimsNum) { + if (IsolateDomain.is_null()) + return {}; unsigned Dims = unsignedFromIslSize(IsolateDomain.tuple_dim()); assert(OutDimsNum <= Dims && "The isl::set IsolateDomain is used to describe the range of schedule " diff --git a/polly/test/ScheduleOptimizer/prevectorization_islbound.ll b/polly/test/ScheduleOptimizer/prevectorization_islbound.ll new file mode 100644 index 0000000000000..0bc3c2cf642e8 --- /dev/null +++ b/polly/test/ScheduleOptimizer/prevectorization_islbound.ll @@ -0,0 +1,37 @@ +; RUN: opt %loadNPMPolly -S -polly-vectorizer=stripmine -passes=polly-opt-isl -polly-debug -disable-output < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +define void @ham(ptr %arg, ptr %arg1, i32 %arg2, i32 %arg3, ptr %arg4, i32 %arg5, i32 %arg6) { +bb: + %getelementptr = getelementptr [7 x float], ptr null, i32 0, i32 %arg3 + br label %bb7 + +bb7: ; preds = %bb11, %bb + %phi = phi i32 [ 0, %bb ], [ %add16, %bb11 ] + br label %bb8 + +bb8: ; preds = %bb8, %bb7 + %phi9 = phi i32 [ 0, %bb7 ], [ %add, %bb8 ] + %getelementptr10 = getelementptr [7 x float], ptr null, i32 0, i32 %phi9 + store float 0.000000e+00, ptr %getelementptr10, align 4 + %add = add i32 %phi9, 1 + %icmp = icmp eq i32 %phi9, 0 + br i1 %icmp, label %bb8, label %bb11 + +bb11: ; preds = %bb8 + %load = load float, ptr %getelementptr, align 4 + store float %load, ptr %arg4, align 4 + %getelementptr12 = getelementptr [7 x float], ptr null, i32 0, i32 %arg5 + %load13 = load float, ptr %getelementptr12, align 4 + store float %load13, ptr %arg, align 4 + %getelementptr14 = getelementptr [7 x float], ptr null, i32 0, i32 %arg6 + %load15 = load float, ptr %getelementptr14, align 4 + store float %load15, ptr %arg1, align 4 + %add16 = add i32 %phi, 1 + %icmp17 = icmp ne i32 %phi, %arg2 + br i1 %icmp17, label %bb7, label %bb18 + +bb18: ; preds = %bb11 + ret void +} +; CHECK:Schedule optimizer calculation exceeds ISL quota diff --git a/runtimes/cmake/Modules/HandleLibC.cmake b/runtimes/cmake/Modules/HandleLibC.cmake index 01da5b260d3d4..f8869512f99d3 100644 --- a/runtimes/cmake/Modules/HandleLibC.cmake +++ b/runtimes/cmake/Modules/HandleLibC.cmake @@ -30,7 +30,9 @@ elseif (RUNTIMES_USE_LIBC STREQUAL "llvm-libc") check_cxx_compiler_flag(-nostdlibinc CXX_SUPPORTS_NOSTDLIBINC_FLAG) if(CXX_SUPPORTS_NOSTDLIBINC_FLAG) target_compile_options(runtimes-libc-headers INTERFACE "-nostdlibinc") - target_compile_options(runtimes-libc-headers INTERFACE "-idirafter${LIBC_KERNEL_HEADERS}") + if(LIBC_KERNEL_HEADERS) + target_compile_options(runtimes-libc-headers INTERFACE "-idirafter${LIBC_KERNEL_HEADERS}") + endif() endif() add_library(runtimes-libc-static INTERFACE) diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index ae9ff2878e03b..3ded8bdf89514 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -80,6 +80,7 @@ cc_library( hdrs = glob(["LanguageRuntime/CPlusPlus/*.h"]), includes = [".."], deps = [ + "//clang:codegen", "//lldb:CoreHeaders", "//lldb:Headers", "//lldb:SymbolHeaders", diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index bcee6c4e243a4..ad013035251f5 100644 --- 
a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -2551,6 +2551,7 @@ llvm_target_lib_list = [lib for lib in [ "lib/Target/WebAssembly/WebAssemblyGenRegisterInfo.inc": ["-gen-register-info"], "lib/Target/WebAssembly/WebAssemblyGenSubtargetInfo.inc": ["-gen-subtarget"], "lib/Target/WebAssembly/WebAssemblyGenAsmMatcher.inc": ["-gen-asm-matcher"], + "lib/Target/WebAssembly/WebAssemblyGenSDNodeInfo.inc": ["-gen-sd-node-info"], }, }, {
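The ScheduleOptimizer changes above hoist the IslMaxOperationsGuard to function scope (constructed with AutoEnter=false) and enter it explicitly around both the schedule computation and the prevectorization post-optimizations, bailing out when the quota is exceeded. The snippet below is a minimal, illustrative sketch of that pattern only, not part of the patch; it assumes IslMaxOperationsGuard and IslQuotaScope live in Polly's GICHelper support header, and boundedIslWork is a hypothetical stand-in for the real work done in runIslScheduleOptimizer.

// Sketch, not part of the patch: bound expensive isl work with an
// operations quota and check it afterwards.
#include "polly/Support/GICHelper.h" // assumed home of IslMaxOperationsGuard
using namespace polly;

static void boundedIslWork(isl_ctx *Ctx, unsigned long ComputeOut) {
  // Construct the guard once; AutoEnter=false means no quota is installed yet.
  IslMaxOperationsGuard MaxOpGuard(Ctx, ComputeOut, /*AutoEnter=*/false);

  {
    // The operation budget is active only while this scope object is alive.
    IslQuotaScope Scope = MaxOpGuard.enter();
    // ... expensive isl computations; they start yielding null objects once
    // the budget is exhausted ...
  }

  if (MaxOpGuard.hasQuotaExceeded()) {
    // Give up and keep the original schedule, as the patch now does.
    return;
  }
}

The effect of AutoEnter=false is that the budget applies only inside the explicitly entered scopes, so code between the bounded regions (diagnostics, option handling) runs without a quota while still sharing a single guard for the final hasQuotaExceeded() check.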
Number Section Status Issue title Available in Clang?
1[dcl.fct.default] TC1 What if two using-declarations refer to the same function but the declarations introduce different default-arguments? No
2[temp.dep.res] drafting How can dependent names be used in member declarations that appear outside of the class template definition? Not resolved
3[temp.expl.spec] NAD The template compilation model rules render some explicit specialization declarations not visible during instantiation Clang 2.7
4[dcl.link] CD1 Does extern "C" affect the linkage of function names with internal linkage? Clang 2.8
5[dcl.init] CD1 CV-qualifiers and type conversions Clang 3.1
6[class.copy.elision] NAD Should the optimization that allows a class object to alias another object also allow the case of a parameter in an inline function to alias its argument? Yes
7[class.access.base] NAD Can a class with a private virtual base class be derived from? Clang 3.4
8[class.access] CD1 Access to template arguments used in a function return type and in the nested name specifier Duplicate of 45
9[class.access.base] CD1 Clarification of access to base class members Clang 2.8
10[class.access.nest] CD1 Can a nested class access its own class name as a qualified name if it is a private member of the enclosing class? Duplicate of 45
11[namespace.udecl] CD1 How do the keywords typename/template interact with using-declarations? Clang 2.7
12[basic.lookup.argdep] dup Default arguments on different declarations for the same function and the Koenig lookup Superseded by 239
13[dcl.link] NAD extern "C" for Parameters of Function Templates No
14[dcl.link] NAD extern "C" functions and declarations in different namespaces Clang 3.4
15[dcl.fct.default] dup Default arguments for parameters of function templates Clang 2.7
16[class.access.base] CD1 Access to members of indirect private base classes Clang 2.8
17[class.access.base] NAD Footnote 99 should discuss the naming class when describing members that can be accessed from friends Clang 2.7
18[dcl.fct] NAD f(TYPE) where TYPE is void should be allowed Superseded by 577
19[class.protected] NAD Clarify protected member access Clang 3.1
20[class.copy.ctor] TC1 Some clarifications needed for 12.8 para 15 Clang 2.8
21[temp.param] TC1 Can a default argument for a template parameter appear in a friend declaration? Clang 3.4
22[temp.dep.res] TC1 Template parameter with a default argument that refers to itself Superseded by 481
23[temp.func.order] NAD Some questions regarding partial ordering of function templates Clang 2.7
24[temp.expl.spec] TC1 Errors in examples in 14.7.3 N/A
25[except.spec] TC1 Exception specifications and pointers to members Clang 4
26[class.copy.ctor] NAD Copy constructors and default arguments Clang 2.7
27[over.built] NAD Overload ambiguities for builtin ?: prototypes Clang 2.7
28[basic.start.dynamic] CD1 'exit', 'signal' and static object destruction N/A (Library DR)
29[dcl.link] CD1 Linkage of locally declared functions Clang 3.4
30[temp.names] TC1 Valid uses of "::template" Superseded by 468 (C++11 onwards)
31[expr.new] NAD Looking up new/delete Clang 2.8
32[temp] TC1 Clarification of explicit instantiation of non-exported templates N/A
33[basic.lookup.argdep] TC1 Argument dependent lookup and overloaded functions Clang 9
34[temp.inst] NAD Argument dependent lookup and points of instantiation N/A
35[dcl.init] TC1 Definition of default-initialization Duplicate of 178
36[namespace.udecl] CD6 using-declarations in multiple-declaration contexts Clang 2.8
37[except.uncaught] NAD When is uncaught_exception() true? Superseded by 475
38[temp.names] TC1 Explicit template arguments and operator functions Clang 2.7
39[class.member.lookup] CD1 Conflicting ambiguity rules No
40[dcl.meaning] TC1 Syntax of declarator-id N/A
41[basic.lookup.unqual] TC1 Clarification of lookup of names after declarator-id Clang 2.7
42[basic.scope.class] NAD Redefining names from base classes Clang 2.7
43[basic.types] TC1 Copying base classes (PODs) using memcpy N/A
44[temp.expl.spec] CD1 Member specializations Superseded by 727
45[class.access.nest] CD1 Access to nested classes Clang 2.7
46[temp.explicit] NAD Explicit instantiation of member templates Clang 2.7
47[temp.friend] NAD Template friend issues Superseded by 329
48[class.static.data] TC1 Definitions of unused static members Clang 2.7
49[temp.param] TC1 Restriction on non-type, non-value template arguments Clang 2.8
50[basic.def.odr] NAD Converting pointer to incomplete type to same type Clang 2.7
51[over.match.best] TC1 Overloading and user-defined conversions Clang 2.8
52[expr.ref] TC1 Non-static members, member selection and access checking Clang 2.8
53[expr.static.cast] TC1 Lvalue-to-rvalue conversion before certain static_casts Clang 2.7
54[expr.static.cast] CD1 Static_cast from private base to derived class Clang 2.8
55[expr.add] NAD Adding/subtracting pointer and enumeration value Clang 2.7
56[dcl.typedef] TC1 Redeclaring typedefs within classes Clang 2.7
57[class.union] open Empty unions Not resolved
58[class.bit] CD1 Signedness of bit fields of enum type Clang 3.1
59[over.match.copy] TC1 Clarification of overloading and UDC to reference type Clang 2.7
60[over.ics.ref] CD1 Reference binding and valid conversion sequences Clang 2.7
61[over.over] NAD Address of static member function "&p->f" Clang 3.4
62[temp.arg.type] CD1 Unnamed members of classes used as type parameters Clang 2.9
63[temp.inst] CD1 Class instantiation from pointer conversion to void*, null and self Clang 2.7
64[temp.expl.spec] TC1 Partial ordering to disambiguate explicit specialization Clang 2.7
65[dcl.fct.default] TC1 Typo in default argument example N/A
66[dcl.fct.default] NAD Visibility of default args vs overloads added after using-declaration No
67[class.static] TC1 Evaluation of left side of object-expression N/A
68[dcl.type.elab] TC1 Grammar does not allow "friend class A<int>;" Clang 2.8
69[dcl.stc] TC1 Storage class specifiers on template declarations Clang 9
70[temp.deduct.type] CD1 Is an array bound a nondeduced context? Clang 2.7
71[expr] NAD Incorrect cross reference N/A
72[temp] dup Linkage and storage class specifiers for templates Duplicate of 69
73[expr.eq] TC1 Pointer equality Superseded by 1652
74[expr.new] TC1 Enumeration value in direct-new-declarator Clang 2.7
75[class.mem] TC1 In-class initialized members must be const Clang 2.7
76[dcl.type.cv] TC1 Are const volatile variables considered "constant expressions"? Clang 2.7
77[class.friend] CD1 The definition of friend does not allow nested classes to be friends Clang 2.7
78[dcl.init] CD1 Section 8.5 paragraph 9 should state it only applies to non-static objects Superseded by ????
79[new.delete.placement] dup Alignment and placement new N/A
80[class.mem] TC1 Class members with same name as class Clang 2.9
81[diff] NAD Null pointers and C compatibility N/A
82[basic.def.odr] dup Definition of "using" a constant expression Duplicate of 48
83[over.ics.rank] TC1 Overloading and deprecated conversion of string literal Clang 2.7
84[over.best.ics] TC1 Overloading and conversion loophole used by auto_ptr Clang 2.7
85[basic.lookup.elab] TC1 Redeclaration of member class Clang 3.4
86[class.temporary] CD1 Lifetime of temporaries in query expressions Duplicate of 446
87[except.spec] CD1 Exception specifications on function parameters No
88[temp.expl.spec] NAD Specialization of member constant templates Clang 2.8
89[basic.life] TC1 Object lifetime does not account for reference rebinding N/A
90[basic.lookup.argdep] TC1 Should the enclosing class be an "associated class" too? Clang 2.7
91[basic.lookup.argdep] NAD A union's associated types should include the union itself Clang 2.7
92[except.spec] CD4 Should exception-specifications be part of the type system? Clang 4 (C++17 onwards)
93[basic.life] TC1 Missing word in 3.8 basic.life paragraph 2 N/A
94[expr.const] TC1 Inconsistencies in the descriptions of constant expressions Clang 2.7
95[namespace.memdef] NAD Elaborated type specifiers referencing names declared in friend decls Clang 3.3
96[temp.names] C++11 Syntactic disambiguation using the template keyword Superseded by P1787
97[expr.const] NAD Use of bool constants in integral constant expressions Clang 2.7
98[except] TC1 Branching into try block Clang 2.7
99[temp.deduct.call] NAD Partial ordering, references and cv-qualifiers Superseded by 214
100[temp.arg.nontype] TC1 Clarify why string literals are not allowed as template arguments Clang 2.7
101[namespace.udecl] TC1 Redeclaration of extern "C" names via using-declarations Clang 3.5
102[over.match.oper] NAD Operator lookup rules do not work well with parts of the library Clang 2.7
103[namespace.udir] TC1 Is it extended-namespace-definition or extension-namespace-definition ? N/A
104[except.throw] NAD Destroying the exception temp when no handler is found N/A (Library DR)
105[temp] TC1 Meaning of "template function" N/A
106[unknown] CD1 Creating references to references during template deduction/instantiation Superseded by 540
107[dcl.link] NAD Linkage of operator functions Clang 2.7
108[temp.dep.type] TC1 Are classes nested in templates dependent? Clang 2.9
109[namespace.udecl] NAD Allowing ::template in using-declarations Clang 2.8
110[temp] CD6 Can template functions and classes be declared in the same scope? Clang 2.8
111[class.copy.ctor] NAD Copy constructors and cv-qualifiers Duplicate of 535
112[dcl.array] CD1 Array types and cv-qualifiers Clang 3.1
113[expr.call] CD1 Visibility of called function Clang 2.7
114[temp.mem] NAD Virtual overriding by template member function specializations Clang 2.7
115[over.over] CD1 Address of template-id Clang 3.0
116[temp.over.link] TC1 Equivalent and functionally-equivalent function templates Clang 2.7
117[class.temporary] NAD Timing of destruction of temporaries N/A
118[expr.call] CD1 Calls via pointers to virtual member functions Yes
119[basic.life] CD1 Object lifetime and aggregate initialization N/A
120[temp.res] TC1 Nonexistent non-terminal qualified-name N/A
121[temp.res] TC1 Dependent type names with non-dependent nested-name-specifiers Clang 2.7
122[expr.prim.general] CD1 template-ids as unqualified-ids Clang 2.7
123[expr.prim.general] TC1 Bad cross-reference N/A
124[class.temporary] CD1 Lifetime of temporaries in default initialization of class arrays Clang 2.7
125[expr.prim.general] CD1 Ambiguity in friend declaration syntax Clang 2.7
126[except.spec] TC1 Exception specifications and const Partial
127[expr.new] TC1 Ambiguity in description of matching deallocation function Clang 2.9
128[expr.static.cast] TC1 Casting between enum types Clang 2.7
129[intro.execution] CD3 Stability of uninitialized auto variables Duplicate of 616
130[expr.new] NAD Sequence points and new-expressions N/A
131[extendid] TC1 Typo in Lao characters Superseded by P1949
132[basic.link] NAD Local types and linkage No
133[except.spec] dup Exception specifications and checking Duplicate of 87
134[temp] TC1 Template classes and declarator-ids N/A
135[dcl.fct] TC1 Class type in in-class member function definitions Clang 2.7
136[dcl.fct.default] CD1 Default arguments and friend declarations Clang 3.4
137[expr.static.cast] TC1 static_cast of cv void* Clang 2.7
138[namespace.memdef] CD6 Friend declaration name lookup Partial
139[basic.lookup.unqual] CD1 Error in friend lookup example Clang 2.7
140[dcl.fct] CD1 Agreement of parameter declarations Clang 2.7
141[basic.lookup.classref] CD1 Non-member function templates in member access expressions Clang 3.1
142[class.access.base] TC1 Injection-related errors in access example Clang 2.8
143[basic.lookup.argdep] CD1 Friends and Koenig lookup Clang 2.7
144[dcl.type.elab] open Position of friend specifier Not resolved
145[depr.impldec] TC1 Deprecation of prefix ++ Clang 2.7
146[basic.fundamental] open Floating-point zero Not resolved
147[expr.prim.general] TC1 Naming the constructor Clang 2.7
148[class] TC1 POD classes and pointers to members Clang 2.7
149[conv.ptr] TC1 Accessibility and ambiguity N/A
150[temp.arg.template] C++17 Template template parameters and default arguments Clang 19
151[dcl.init] TC1 Terminology of zero-initialization Clang 3.1
152[class.conv.ctor] TC1 explicit copy constructors Clang 2.7
153[over.ics.rank] TC1 Misleading wording (rank of conversion) N/A
154[dcl.stc] NAD Anonymous unions in unnamed namespaces Clang 2.7
155[dcl.init] dup Brace initializer for scalar Duplicate of 632
156[basic.lookup.classref] NAD Name lookup for conversion functions Superseded by 1111
157[dcl.pre] open Omitted typedef declarator Not resolved
158[basic.lval] CD1 Aliasing and qualification conversions Yes
159[dcl.meaning] TC1 Namespace qualification in declarators Clang 3.5
160[dcl.ambig.res] CD1 Missing std:: qualification N/A
161[class.protected] TC1 Access to protected nested type Clang 3.1
162[over.match.call] CD1 (&C::f)() with nonstatic members Clang 19
163[dcl.init.aggr] TC1 Description of subaggregate initializer N/A
164[basic.lookup.argdep] TC1 Overlap between Koenig and normal lookup Clang 2.7
165[namespace.memdef] NAD Definitions of friends and block-scope externs No
166[namespace.memdef] TC1 Friend declarations of template-ids Clang 2.9
167[depr.static] NAD Deprecating static functions Superseded by 1012
168[dcl.link] NAD C linkage for static member functions No
169[namespace.udecl] NAD template-ids in using-declarations Clang 3.4
170[conv.mem] CD7 Pointer-to-member conversions Clang 3.1
171[basic.namespace] TC1 Global namespace scope Clang 3.4
172[dcl.enum] CD1 Unsigned int as underlying type of enum Clang 2.7
173[lex.charset] TC1 Constraints on execution character set Clang 2.7
174[depr.static] NAD Undeprecating global static Superseded by 1012
175[class] CD1 Class name injection and base name access Clang 2.8
176[class] TC1 Name injection and templates Clang 3.1
177[dcl.init] CD1 Lvalues vs rvalues in copy-initialization Clang 2.7
178[dcl.init] TC1 More on value-initialization Clang 3.1
179[expr.add] TC1 Function pointers and subtraction Clang 2.7
180[temp.res] CD1 typename and elaborated types Clang 2.8
181[temp.deduct.type] TC1 Errors in template template-parameter example Clang 2.7
182[temp.expl.spec] NAD Access checking on explicit specializations Clang 14
183[temp.res] TC1 typename in explicit specializations Superseded by 382
184[temp.param] CD1 Default arguments in template template-parameters Clang 2.7
185[class.copy.ctor] TC1 "Named" temporaries and copy elision Clang 2.7
186[temp.local] open Name hiding and template template-parameters Not resolved
187[temp.param] TC1 Scope of template parameter names Superseded by 481
188[expr.comma] TC1 Comma operator and rvalue conversion Clang 2.7
189[lex.operators] open Definition of operator and punctuator Not resolved
190[class.mem] TC1 Layout-compatible POD-struct types Clang 19
191[basic.lookup.unqual] CD6 Name lookup does not handle complex nesting Clang 2.7
192[basic.lookup.unqual] NAD Name lookup in parameters Clang 2.7
193[class.dtor] TC1 Order of destruction of local automatics of destructor Clang 2.7
194[class.ctor] TC1 Identifying constructors Clang 2.7
195[expr.reinterpret.cast] CD1 Converting between function and object pointers Clang 2.7
196[expr.delete] open Arguments to deallocation functions Not resolved
197[temp.dep.candidate] CD1 Issues with two-stage lookup of dependent names Clang 2.7
198[class.local] CD1 Definition of "use" in local and nested classes Clang 2.9
199[class.temporary] CD1 Order of destruction of temporaries Clang 2.8
200[temp.func.order] dup Partial ordering and explicit arguments Duplicate of 214
201[class.temporary] CD1 Order of destruction of temporaries in initializers Clang 2.8
202[over.over] TC1 Use of overloaded function name Clang 3.1
203[expr.unary.op] NAD Type of address-of-member expression Clang 3.0
204[temp] CD1 Exported class templates Superseded by 820
205[temp] drafting Templates and static data members Not resolved
206[temp.nondep] TC1 Semantic constraints on non-dependent names Clang 2.7
207[class.access.base] CD1 using-declarations and protected access Clang 2.7
208[except.throw] CD1 Rethrowing exceptions in nested handlers Unknown
209[class.friend] NAD Must friend declaration names be accessible? Clang 3.2
210[except.handle] TC1 What is the type matched by an exception handler? Clang 2.7
211[except] NAD Constructors should not be allowed to return normally after an exception Clang 2.7
212[temp.inst] CD4 Implicit instantiation is not described clearly enough Yes
213[temp.dep] TC1 Lookup in dependent base classes Clang 2.7
214[temp.func.order] CD1 Partial ordering of function templates is underspecified Clang 2.7
215[temp.param] CD1 Template parameters are not allowed in nested-name-specifiers Clang 2.9
216[basic.link] CD1 Linkage of nameless class-scope enumeration types No
217[dcl.fct.default] TC1 Default arguments for non-template member functions of class templates Clang 2.7
218[basic.lookup.argdep] CD1 Specification of Koenig lookup Clang 2.7
219[except.terminate] NAD Cannot defend against destructors that throw exceptions N/A
220[basic.stc.dynamic.deallocation] CD1 All deallocation functions should be required not to throw N/A
221[over.assign] CD1 Must compound assignment operators be member functions? Clang 3.6
222[expr] CD1 Sequence points and lvalue-returning operators Duplicate of 637
223[depr] CD3 The meaning of deprecation N/A
224[temp.dep.type] CD1 Definition of dependent names Clang 16
225[basic.lookup.argdep] NAD Koenig lookup and fundamental types Yes
226[temp.param] CD1 Default template arguments for function templates No
227[stmt.select] TC1 How many scopes in an if statement? Clang 2.7
228[temp.names] CD1 Use of template keyword with non-member templates Clang 2.7
229[temp.spec.partial] NAD Partial specialization of function templates Clang 2.9
230[class.abstract] NAD Calls to pure virtual functions Clang 3.0
231[basic.lookup.unqual] NAD Visibility of names after using-directives Clang 2.7
232[expr.unary.op] NAD Is indirection through a null pointer undefined behavior? Duplicate of 2823
233[dcl.init.ref] CD7 References vs pointers in UDC overload resolution Unknown
234[basic.life] NAD Reuse of base class subobjects N/A
235[class.base.init] TC1 Assignment vs initialization N/A
236[expr.const] NAD Explicit temporaries and integral constant expressions Clang 3.2
237[temp.explicit] CD1 Explicit instantiation and base class members Duplicate of 470
238[expr] CD4 Precision and accuracy constraints on floating point Unknown
239[over.call.func] CD1 Footnote 116 and Koenig lookup Clang 2.7
240[conv.lval] CD3 Uninitialized values and undefined behavior Duplicate of 616
241[temp.arg.explicit] TC1 Error in example in 14.8.1 Clang 9
242[expr.cast] CD4 Interpretation of old-style casts Unknown
243[over.ics.user] NAD Weighting of conversion functions in direct-initialization Clang 2.8
244[class.dtor] CD1 Destructor lookup Clang 11
245[basic.lookup.elab] CD1 Name lookup in elaborated-type-specifiers Clang 2.8
246[temp.arg] CD1 Jumps in function-try-block handlers Clang 3.2
247[over.over] NAD Pointer-to-member casts and function overload resolution Clang 2.7
248[extendid] C++11 Identifier characters Superseded by P1949
249[temp.mem.func] TC1 What is a member function template? Clang 2.7
250[over.over] TC1 Address of function template specialization with non-deduced template arguments Clang 2.7
251[basic.fundamental] open How many signed integer types are there? Not resolved
252[class.dtor] CD1 Looking up deallocation functions in virtual destructors Clang 3.1
253[dcl.init] C++17 Why must empty or fully-initialized const objects be initialized? Unknown
254[basic.lookup.elab] CD1 Definitional problems with elaborated-type-specifiers Clang 2.9
255[class.free] CD6 Placement deallocation functions and lookup ambiguity Clang 2.7
256[expr.new] CD1 Overflow in size calculations Duplicate of 624
257[class.base.init] CD2 Abstract base constructors and virtual base initialization Clang 3.4
258[namespace.udecl] CD1 using-declarations and cv-qualifiers Clang 2.8
259[temp.spec] CD1 Restrictions on explicit specialization and instantiation Clang 4
260[over.built] open User-defined conversions and built-in operator= Not resolved
261[basic.def.odr] CD1 When is a deallocation function "used?" No
262[dcl.fct] CD1 Default arguments and ellipsis Clang 2.7
263[class.ctor] CD1 Can a constructor be declared a friend? Clang 3.3
264[temp.arg.explicit] open Unusable template constructors and conversion functions Not resolved
265[expr.delete] dup Destructors, exceptions, and deallocation Duplicate of 353
266[gram] NAD No grammar sentence symbol N/A
267[expr.new] open Alignment requirement for new-expressions Not resolved
268[cpp.rescan] open Macro name suppression in rescanned replacement text Not resolved
269[basic.start.static] NAD Order of initialization of multiply-defined static data members of class templates N/A
270[basic.start.static] CD1 Order of initialization of static data members of class templates N/A
271[temp.deduct] CD6 Explicit instantiation and template argument deduction Unknown
272[class.dtor] CD1 Explicit destructor invocation and qualified-ids Clang 2.7
273[class] CD1 POD classes and operator&() Clang 2.7
274[basic.life] CD1 Cv-qualification and char-alias access to out-of-lifetime objects N/A
275[temp.expl.spec] CD1 Explicit instantiation/specialization and using-directives No
276[stmt.jump] CD1 Order of destruction of parameters and temporaries N/A
277[dcl.init] CD1 Zero-initialization of pointers Clang 3.1
278[basic.link] NAD External linkage and nameless entities Unknown
279[basic.link] CD6 Correspondence of "names for linkage purposes" No
280[over.call.object] CD1 Access and surrogate call functions Clang 2.9
281[dcl.fct.spec] CD1 inline specifier in friend declarations No
282[expr.typeid] open Namespace for extended_type_info Not resolved
283[dcl.type.simple] CD1 Template type-parameters are not syntactically type-names Clang 2.7
284[class] CD1 qualified-ids in class declarations No
285[temp.expl.spec] NAD Identifying a function template being specialized Clang 2.7
286[temp.spec.partial] CD1 Incorrect example in partial specialization Clang 2.8
287[temp.point] drafting Order dependencies in template instantiation Not resolved
288[expr.delete] CD1 Misuse of "static type" in describing pointers N/A
289[basic.def.odr] CD1 Incomplete list of contexts requiring a complete type Clang 2.7
290[basic.types] NAD Should memcpy be allowed into a POD with a const member? N/A
291[dcl.init.ref] CD1 Overload resolution needed when binding reference to class rvalue Duplicate of 391
292[expr.new] CD3 Deallocation on exception in new before arguments evaluated Clang 2.9
293[temp.explicit] open Syntax of explicit instantiation/specialization too permissive Not resolved
294[expr.static.cast] NAD Can static_cast drop exception specifications? No
295[dcl.fct] CD1 cv-qualifiers on function types Clang 3.7
296[class.conv.fct] CD1 Can conversion functions be static? Clang 2.7
297[temp.deduct] NAD Which template does an explicit specialization specialize? Unknown
298[class.qual] CD1 T::x when T is cv-qualified Clang 3.1
299[expr.new] CD1 Conversion on array bound expression in new Clang 2.8 (C++11 onwards)
300[temp.deduct.type] CD1 References to functions in template argument deduction Clang 2.7
301[temp.names] CD1 Syntax for template-name Clang 3.5
302[dcl.init] CD1 Value-initialization and generation of default constructor Clang 3.0
303[conv.prom] NAD Integral promotions on bit-fields N/A
304[dcl.init] TC1 Value-initialization of a reference Clang 2.9
305[basic.lookup.classref] CD1 Name lookup in destructor call No
306[class.member.lookup] CD1 Ambiguity by class name injection Duplicate of 39
307[class.cdtor] NAD Initialization of a virtual base class subobject N/A
308[except.handle] NAD Catching exceptions with ambiguous base classes Clang 3.7
309[basic.pre] CD1 Linkage of entities whose names are not simply identifiers, in introduction Duplicate of 485
310[temp.over.link] open Can function templates differing only in parameter cv-qualifiers be overloaded? Not resolved
311[namespace.def] NAD Using qualified name to reopen nested namespace Clang 3.0
312[basic.stc.dynamic.deallocation] CD3 “use” of invalid pointer value not defined Duplicate of 616
313[expr.new] dup Class with single conversion function to integral as array size in new Duplicate of 299 (C++11 onwards)
314[temp.names] C++17 template in base class specifier No
315[class.static.mfct] NAD Is call of static member function through null pointer undefined? N/A
316[temp.local] NAD Injected-class-name of template used as template template parameter Superseded by 1004
317[dcl.fct.spec] CD1 Can a function be declared inline after it has been called? Clang 3.5
318[class.qual] CD1 struct A::A should not name the constructor of A Superseded by 1310
319[basic.link] CD1 Use of names without linkage in declaring entities with linkage No
320[class.temporary] CD1 Question on copy constructor elision example Clang 3.1
321[basic.lookup.argdep] dup Associated classes and namespaces for argument-dependent lookup Duplicate of 557
322[temp.deduct.conv] CD1 Deduction of reference conversions Clang 2.8
323[temp] CD1 Where must export appear? Superseded by 820
324[expr.unary.op] CD1 Can "&" be applied to assignment to bit-field? Clang 3.6
325[dcl.fct.default] open When are default arguments parsed? Not resolved
326[class.ctor] CD1 Wording for definition of trivial constructor Clang 3.1
327[class] CD1 Use of "structure" without definition Duplicate of 538
328[class.mem] CD1 Missing requirement that class member types be complete Clang 2.7
329[temp.friend] CD1 Evaluation of friends of templates Clang 3.5
330[conv.qual] CD4 Qualification conversions and pointers to arrays of pointers Clang 7
331[class.ctor] CD1 Allowed copy constructor signatures Clang 11
332[dcl.fct] CD3 cv-qualified void parameter types Duplicate of 577
333[dcl.ambig.res] NAD Ambiguous use of "declaration" in disambiguation section Clang 2.7
334[temp.dep.expr] NAD Is a comma-expression dependent if its first operand is? Clang 2.7
335[temp] CD1 Allowing export on template members of nontemplate classes Superseded by 820
336[temp.expl.spec] CD1 Explicit specialization examples are still incorrect Clang 2.7
337[temp.deduct] CD1 Attempt to create array of abstract type should cause deduction to fail Clang 2.7
338[basic.link] CD6 Enumerator name with linkage used as class name in other translation unit Duplicate of 1884
339[expr.const] CD1 Overload resolution in operand of sizeof in constant expression Clang 2.8
340[dcl.ambig.res] NAD Unclear wording in disambiguation section Clang 2.7
341[dcl.link] C++11 extern "C" namespace member function versus global variable Superseded by 1708
342[expr.unary] CD3 Terminology: "indirection" versus "dereference" N/A
343[temp.names] C++17 Make template optional in contexts that require a type No
344[class.dtor] CD3 Naming destructors Duplicate of 1435
345[temp.res] CD1 Misleading comment on example in templates chapter Clang 2.7
346[except.spec] NAD Typo in 15.4 N/A
347[class.nest] NAD Use of derived class name in defining base class nested class Clang 2.7
348[basic.stc.dynamic.deallocation] CD1 delete and user-written deallocation functions N/A
349[temp.deduct.conv] CD1 Template argument deduction for conversion functions and qualification conversions No
350[basic.types] open signed char underlying representation for objects Not resolved
351[expr] CD1 Sequence point error: unspecified or undefined? N/A
352[temp.deduct.call] CD1 Nondeduced contexts Clang 2.8
353[expr.delete] CD1 Is deallocation routine called if destructor throws exception in delete? Unknown
354[temp.arg.nontype] CD1 Null as nontype template argument Clang 3.1 (C++11 onwards)
355[class] C++11 Global-scope :: in nested-name-specifier Clang 2.7
356[class.copy.ctor] NAD Wording of behavior of generated copy constructor for scalar members N/A
357[intro.defs] CD1 Definition of signature should include name Clang 2.7
358[dcl.link] NAD Namespaces and extern "C" Clang 2.7
359[class.union] NAD Type definition in anonymous union Clang 3.3
360[class.access.base] CD6 Using-declaration that reduces access Clang 2.8
361[dcl.fct.default] open Forward reference to default argument Not resolved
362[lex.phases] CD1 Order of initialization in instantiation units N/A
363[class.expl.init] NAD Initialization of class from self N/A
364[over.call.func] CD1 Calling overloaded function with static in set, with no object Clang 2.7
365[basic.stc] open Storage duration and temporaries Not resolved
366[expr.const] CD1 String literal allowed in integral constant expression? Clang 2.7
367[expr.const] CD1 throw operator allowed in constant expression? Clang 2.7
368[temp.deduct] CD1 Uses of non-type parameters that should cause deduction to fail Clang 3.6
369[lex.pptoken] open Are new/delete identifiers or preprocessing-op-or-punc? Not resolved
370[cpp.include] CD1 Can #include <...> form be used other than for standard C++ headers? N/A
371[basic.start.static] open Interleaving of constructor calls Not resolved
372[temp.arg] CD1 Is access granted by base class specifiers available in following base class specifiers? No
373[basic.lookup.udir] C++11 Lookup on namespace qualified name in using-directive Clang 5
374[dcl.meaning] CD2 Can explicit specialization outside namespace use qualified name? Clang 7
375[temp.res] dup Confusing example on lookup with typename Duplicate of 345
376[dcl.fct.spec] NAD Class "definition" versus class "declaration" N/A
377[dcl.enum] CD1 Enum whose enumerators will not fit in any integral type Clang 2.7
378[stmt.jump] CD1 Wording that says temporaries are declared Duplicate of 276
379[class] CD1 Change "class declaration" to "class definition" N/A
380[class.member.lookup] open Definition of "ambiguous base class" missing Not resolved
381[basic.lookup.classref] CD1 Incorrect example of base class member lookup Clang 2.7
382[temp.res] CD1 Allow typename outside of templates Clang 2.7 (C++11 onwards)
383[class] CD1 Is a class with a declared but not defined destructor a POD? Clang 2.7
384[basic.lookup.argdep] NAD Argument-dependent lookup and operator functions Clang 2.7
385[class.protected] CD1 How does protected member check of 11.5 interact with using-declarations? Clang 2.8
386[namespace.udecl] CD6 Friend declaration of name brought in by using-declaration No
387[temp.inject] CD1 Errors in example in 14.6.5 Clang 2.8
388[except.handle] CD3 Catching base*& from a throw of derived* Unknown
389[basic.link] CD1 Unnamed types in entities with linkage No
390[class.abstract] CD1 Pure virtual must be defined when implicitly called Clang 3.3
391[dcl.init.ref] CD1 Require direct binding of short-lived references to rvalues Clang 2.8 (C++11 onwards)
392[class.temporary] CD1 Use of full expression lvalue before temporary destruction Clang 2.8
393[dcl.fct] CD4 Pointer to array of unknown bound in template argument list in parameter Clang 2.7
394[cpp.pre] CD1 identifier-list is never defined N/A
395[class.conv.fct] NAD Conversion operator template syntax Clang 3.0
396[dcl.fct.spec] CD1 Misleading note regarding use of auto for disambiguation Clang 3.0
397[dcl.fct.spec] CD1 Same address for string literals from default arguments in inline functions? Superseded by 1823
398[temp.deduct] CD1 Ambiguous wording on naming a type in deduction Clang 2.7
399[class.dtor] CD6 Destructor lookup redux Clang 11
400[namespace.qual] CD1 Using-declarations and the "struct hack" Clang 2.7
401[temp.param] CD1 When is access for template parameter default arguments checked? Clang 2.8
402[temp.func.order] open More on partial ordering of function templates Not resolved
403[basic.lookup.argdep] CD1 Reference to a type as a template-id Clang 2.7
404[basic.life] CD1 Unclear reference to construction with non-trivial constructor N/A
405[basic.lookup.unqual] CD6 Unqualified function name lookup Clang 2.7
406[class.static.data] CD1 Static data member in class with name for linkage purposes Clang 2.9
407[dcl.typedef] C++11 Named class with associated typedef: two names or one? Clang 3.8
408[temp.static] CD2 sizeof applied to unknown-bound array static data member of template Clang 3.4
409[temp.res] CD1 Obsolete paragraph missed by changes for issue 224 Clang 2.7
410[temp.friend] CD1 Paragraph missed in changes for issue 166 No
411[lex.string] CD6 Use of universal-character-name in character versus string literals Unknown
412[dcl.fct.spec] NAD Can a replacement allocation function be inline? Clang 3.4
413[class] CD1 Definition of "empty class" Clang 2.7
414[basic.lookup.classref] CD1 Multiple types found on destructor lookup Duplicate of 305
415[temp.over] CD1 Template deduction does not cause instantiation Clang 2.7
416[over.match.oper] CD1 Class must be complete to allow operator lookup? Clang 2.7
417[class.name] CD1 Using derived-class qualified name in out-of-class nested class definition No
418[over.match.best] CD6 Imperfect wording on error on multiple default arguments on a called function No
419[basic.life] open Can cast to virtual base class be done on partially-constructed object? Not resolved
420[over.ref] CD1 postfixexpression->scalar_type_dtor() inconsistent Clang 9
421[expr.ref] CD1 Is rvalue.field an rvalue? Clang 2.7
422[dcl.typedef] NAD Is a typedef redeclaration allowed with a template type that might be the same? Clang 2.7
423[over.match.oper] NAD Can a conversion be done on the left operand of a compound assignment? Clang 2.7
424[dcl.typedef] CD1 Wording problem with issue 56 resolution on redeclaring typedefs in class scope Clang 2.7
425[over.built] CD1 Set of candidates for overloaded built-in operator with float operand Clang 2.7
426[basic.link] C++17 Identically-named variables, one internally and one externally linked, allowed? Unknown
427[expr.static.cast] CD1 static_cast ambiguity: conversion versus cast to derived Clang 2.7
428[except.throw] CD1 Mention of expression with reference type Clang 2.7
429[expr.new] CD1 Matching deallocation function chosen based on syntax or signature? Clang 2.8 (C++11 onwards)
430[dcl.init.aggr] CD1 Ordering of expression evaluation in initializer list Clang 2.7 (C++11 onwards)
431[temp.names] C++11 Defect in wording in 14.2 Clang 2.8
432[basic.scope.class] CD1 Is injected class name visible in base class specifier list? Clang 3.0
433[basic.scope.pdecl] CD1 Do elaborated type specifiers in templates inject into enclosing namespace scope? Clang 2.7
434[dcl.init.ref] NAD Unclear suppression of standard conversions while binding reference to lvalue Superseded by 2352
435[dcl.pre] NAD Change "declararation or definition" to "declaration" N/A
436[class.bit] CD1 Problem in example in 9.6 paragraph 4 Clang 2.7
437[class.mem] CD1 Is type of class allowed in member function exception specification? Superseded by 1308
438[expr] CD2 Possible flaw in wording for multiple accesses to object between sequence points Clang 2.7
439[expr.static.cast] CD1 Guarantees on casting pointer back to cv-qualified version of original type Clang 2.7
440[temp.arg] NAD Allow implicit pointer-to-member conversion on nontype template argument Unknown
441[basic.start.static] CD1 Ordering of static reference initialization Clang 2.7
442[expr.delete] CD1 Incorrect use of null pointer constant in description of delete operator Superseded by 348
443[class.temporary] CD1 Wording nit in description of lifetime of temporaries N/A
444[class.copy.assign] NAD Overriding and the generated copy assignment operator Clang 2.7
445[class.friend] NAD Wording issue on friend declarations Clang 3.2
446[expr.cond] CD1 Does an lvalue-to-rvalue conversion on the "?" operator produce a temporary? Clang 2.8
447[temp.dep.constexpr] CD1 Is offsetof type-dependent? Clang 2.8
448[temp.local] C++11 Set of template functions in call with dependent explicit argument Clang 2.8
449[intro.defs] NAD Consistency in use of hyphen with names of "non" entities N/A
450[dcl.init.ref] CD1 Binding a reference to const to a cv-qualified array rvalue Clang 3.2
451[expr] CD1 Expressions with invalid results and ill-formedness Clang 2.7
452[class.this] CD1 Wording nit on description of this Clang 2.7
453[dcl.ref] CD7 References may only bind to “valid” objects Unknown
454[class.static.data] CD1 When is a definition of a static data member required? Unknown
455[over.match.best] NAD Partial ordering and non-deduced arguments Unknown
456[conv.ptr] NAD Is initialized const int or const bool variable a null pointer constant? Clang 3.4
457[expr.const] CD1 Wording nit on use of const variables in constant expressions Clang 2.7
458[temp.local] C++11 Hiding of member template parameters by other members Clang 11
459[temp.local] NAD Hiding of template parameters by base class members Unknown
460[namespace.udecl] CD1 Can a using-declaration name a namespace? Clang 2.7
461[dcl.asm] NAD Make asm conditionally-supported N/A
462[class.temporary] CD3 Lifetime of temporaries bound to comma expressions Clang 2.7
463[expr.reinterpret.cast] CD1 reinterpret_cast<T*>(0) N/A
464[class.temporary] CD1 Wording nit on lifetime of temporaries to which references are bound N/A
465[basic.start.static] NAD May constructors of global objects call exit()? N/A
466[expr.pseudo] CD1 cv-qualifiers on pseudo-destructor type Clang 2.8
467[stmt.dcl] NAD Jump past initialization of local static variable Clang 2.7
468[temp.names] CD1 Allow ::template outside of templates Clang 2.7 (C++11 onwards)
469[temp.deduct.type] NAD Const template specializations and reference arguments No
470[temp.explicit] CD1 Instantiation of members of an explicitly-instantiated class template Clang 2.7
471[class.access.base] NAD Conflicting inherited access specifications Clang 2.8
472[class.protected] open Casting across protected inheritance
@@ -2887,5528 +3358,6447 @@

C++ defect report implementation status

473[expr.new] NAD Block-scope declarations of allocator functions Unknown
474[basic.link] CD1 Block-scope extern declarations in namespace members Clang 3.4
475[except.uncaught] C++11 When is std::uncaught_exception() true? (take 2) Unknown
476[expr.new] CD5 Determining the buffer size for placement new Unknown
477[dcl.fct.spec] CD1 Can virtual appear in a friend declaration? Clang 3.5
478[dcl.array] NAD May a function parameter be an array of an abstract class type? Clang 2.7
479[except.throw] CD1 Copy elision in exception handling Clang 2.8
480[conv.mem] CD1 Is a base of a virtual base also virtual? Clang 2.7
481[basic.scope] CD2 Scope of template parameters Clang 2.8
482[dcl.meaning] CD3 Qualified declarators in redeclarations Clang 3.5
483[basic.fundamental] CD3 Normative requirements on integral ranges Clang 2.7
484[class.derived] CD1 Can a base-specifier name a cv-qualified class type? Clang 2.8
485[basic.pre] CD1 What is a “name”? Clang 2.7
486[temp.deduct] CD1 Invalid return types and template argument deduction Clang 2.7
487[expr.const] NAD Operator overloading in constant expressions Clang 2.7
488[temp.deduct] CD1 Local types, overload resolution, and template argument deduction Clang 2.9 (C++11 onwards)
489[temp.inst] NAD Must member function templates be instantiated during overload resolution? N/A
490[basic.lookup.unqual] CD2 Name lookup in friend declarations Clang 2.8
491[dcl.init.aggr] CD1 Initializers for empty-class aggregate members Duplicate of 413
492[expr.typeid] CD1 typeid constness inconsistent with example Clang 2.7
493[temp.deduct.conv] CD2 Type deduction from a bool context Duplicate of 976
494[class.access] CD1 Problems with the resolution of issue 45 Duplicate of 372
495[over.match.best] CD2 Overload resolution with template and non-template conversion functions Clang 3.5
496[basic.types] CD3 Is a volatile-qualified type really a POD? Superseded by 2094
497[expr.mptr.oper] CD1 Missing required initialization in example Superseded by 253
498[dcl.stc] open Storage class specifiers in definitions of class members Not resolved
499[except.throw] CD2 Throwing an array of unknown size Clang 2.7
500[class.friend] CD1 Access in base-specifiers of friend and nested classes Duplicate of 372
501[class.friend] NAD Visibility of friend declarations within the befriending class Clang 2.7
502[temp.dep.type] C++11 Dependency of nested enumerations and enumerators Clang 2.7
503[temp.deduct.call] open Cv-qualified function types in template argument deduction Not resolved
504[dcl.ref] NAD Should use of a variable in its own initializer require a diagnostic? Unknown
505[lex.ccon] CD1 Conditionally-supported behavior for unknown character escapes Clang 2.7
506[expr.call] CD1 Conditionally-supported behavior for non-POD objects passed to ellipsis Clang 2.7
507[over.built] dup Ambiguity assigning class object to built-in type Duplicate of 260
508[dcl.init] C++11 Non-constructed value-initialized objects N/A
509[dcl.init] CD1 Dead code in the specification of default initialization N/A
510[class.init] CD1 Default initialization of POD classes? N/A
511[class.prop] NAD POD-structs with template assignment operators Unknown
512[class.union] NAD Union members with user-declared non-default constructors Clang 3.0
513[intro.object] CD1 Non-class “most-derived” objects N/A
514[basic.lookup.unqual] CD1 Is the initializer for a namespace member in the scope of the namespace? Clang 2.7
515[temp.dep] CD1 Non-dependent references to base class members Superseded by 1017
516[dcl.type.simple] CD1 Use of signed in bit-field declarations N/A
517[temp.spec.partial.general] CD1 Partial specialization following explicit instantiation No
518[dcl.enum] CD1 Trailing comma following enumerator-list Clang 2.7 (C++11 onwards)
519[conv.ptr] CD1 Null pointer preservation in void* conversions Clang 2.7
520[expr.cast] CD1 Old-style casts between incomplete class types N/A
521[basic.stc.dynamic.allocation] CD1 Requirements for exceptions thrown by allocation functions No
522[temp.deduct.call] CD1 Array-to-pointer decay in template argument deduction Clang 2.7
523[basic.stc.dynamic.deallocation] open Can a one-past-the-end pointer be invalidated by deleting an adjacent object? Not resolved
524[temp.dep] CD1 Can function-notation calls to operator functions be dependent? Clang 2.7
525[temp.inst] CD1 Missing * in example Clang 2.7
526[temp.deduct.type] CD1 Confusing aspects in the specification of non-deduced contexts Clang 2.7
527[basic.link] CD2 Problems with linkage of types N/A
528[expr.typeid] NAD Why are incomplete class types not allowed with typeid? Clang 2.7
529[temp.expl.spec] open Use of template<> with “explicitly-specialized” class templates Not resolved
530[expr.const] CD1 Nontype template arguments in constant expressions Clang 2.7
531[temp.expl.spec] C++11 Defining members of explicit specializations Partial
532[temp.func.order] C++11 Member/nonmember operator template partial ordering Clang 3.5
533[cpp.include] NAD Special treatment for C-style header names N/A
534[temp] CD1 template-names and operator-function-ids Clang 2.9
535[class.copy.ctor] CD3 Copy construction without a copy constructor Clang 3.1
536[expr.prim.general] CD6 Problems in the description of id-expressions N/A
537[intro.defs] CD1 Definition of “signature” N/A
538[class] CD1 Definition and usage of structure, POD-struct, POD-union, and POD class N/A
539[dcl.type] CD3 Constraints on type-specifier-seq Clang 3.4
540[namespace.def] CD1 Propagation of cv-qualifiers in reference-to-reference collapse Clang 2.7
541[temp.dep.expr] CD2 Dependent function types Clang 2.7
542[class.init] CD2 Value initialization of arrays of POD-structs Clang 3.5
543[dcl.init] CD1 Value initialization and default constructors Clang 3.0
544[temp.dep] NAD Base class lookup in explicit specialization Clang 2.7
545[over.match.oper] open User-defined conversions and built-in operator overload resolution Not resolved
546[temp.explicit] C++11 Explicit instantiation of class template members Clang 2.7
547[dcl.fct] C++11 Partial specialization on member function types Clang 3.2
548[dcl.meaning] dup qualified-ids in declarations Duplicate of 482
549[temp.spec.partial.match] drafting Non-deducible parameters in partial specializations Not resolved
550[dcl.fct] dup Pointer to array of unknown bound in parameter declarations Duplicate of 393
551[temp.explicit] CD1 When is inline permitted in an explicit instantiation? Clang 2.7 (C++11 onwards)
552[temp.names] NAD Use of typename in the type in a non-type parameter-declaration Clang 2.7
553[namespace.memdef] NAD Problems with friend allocation and deallocation functions Clang 2.7
554[basic.scope] CD6 Definition of “declarative region” and “scope” N/A
555[basic.lookup] CD5 Pseudo-destructor name lookup Clang 2.8
556[expr.assign] CD2 Conflicting requirements for acceptable aliasing N/A
557[basic.lookup.argdep] CD1 Does argument-dependent lookup cause template instantiation? Clang 3.1
558[lex.charset] CD1 Excluded characters in universal character names Clang 2.9
559[temp.res] CD1 Editing error in issue 382 resolution Clang 2.7
560[temp.res] NAD Use of the typename keyword in return types Clang 16
561[temp.dep.candidate] CD2 Internal linkage functions in dependent name lookup Clang 2.7
562[class.qual] CD6 qualified-ids in non-expression contexts N/A
563[dcl.link] CD6 Linkage specification for objects Clang 3.3
564[dcl.link] CD2 Agreement of language linkage or linkage-specifications? Clang 2.7
565[namespace.udecl] CD3 Conflict rules for using-declarations naming function templates Clang 2.7
566[conv.fpint] NAD Conversion of negative floating point values to integer type Clang 3.1
567[expr.add] NAD Can size_t and ptrdiff_t be larger than long? N/A
568[class] CD1 Definition of POD is too strict Clang 3.0 (C++11 onwards)
569[dcl.pre] CD2 Spurious semicolons at namespace scope should be allowed Clang 2.7 (C++11 onwards)
570[basic.def.odr] CD2 Are references subject to the ODR? Duplicate of 633
571[basic.link] CD2 References declared const Clang 2.7
572[conv] C++11 Standard conversions for non-built-in types Clang 2.7
573[expr.reinterpret.cast] C++11 Conversions between function pointers and void* No
574[class.copy.assign] NAD Definition of “copy assignment operator” Clang 3.0
575[temp.deduct] C++11 Criteria for deduction failure Clang 2.7
576[dcl.typedef] CD2 Typedefs in function definitions Clang 3.5
577[dcl.fct] CD3 void in an empty parameter list Clang 3.5
578[lex.phases] CD6 Phase 1 replacement of characters with universal-character-names Unknown
579[temp.names] open What is a “nested” > or >>? Not resolved
580[class.access] C++11 Access in template-parameters of member and friend definitions Partial
581[temp.arg.explicit] CD5 Can a templated constructor be explicitly instantiated or specialized? Unknown
582[temp.mem] CD1 Template conversion functions N/A
583[expr.rel] CD3 Relational pointer comparisons against the null pointer constant Clang 4
584[basic.lval] NAD Unions and aliasing N/A
585[class.friend] NAD Friend template template parameters Clang 3.0
586[temp.deduct.type] NAD Default template-arguments and template argument deduction N/A
587[expr.cond] CD2 Lvalue operands of a conditional expression differing only in cv-qualification Clang 3.2
588[temp.dep] CD2 Searching dependent bases of classes local to function templates Clang 2.7
589[dcl.init.ref] CD2 Direct binding of class and array rvalues in reference initialization Clang 2.7
590[temp.dep.type] C++11 Nested classes and the “current instantiation” Clang 2.7
591[temp.dep] CD4 When a dependent base class is the current instantiation Clang 20
592[except.ctor] CD1 Exceptions during construction of local static objects N/A
593[except.handle] NAD Falling off the end of a destructor's function-try-block handler Clang 2.8
594[basic.life] CD1 Coordinating issues 119 and 404 with delegating constructors N/A
595[except.spec] dup Exception specifications in templates instantiated from class bodies Duplicate of 1330
596[except.unexpected] NAD Replacing an exception object Unknown
597[basic.life] CD3 Conversions applied to out-of-lifetime non-POD lvalues N/A
598[basic.lookup.argdep] CD2 Associated namespaces of overloaded functions and function templates Clang 2.7
599[expr.delete] CD2 Deleting a null function pointer Partial
600[class.access] CD6 Does access control apply to members or to names? Clang 2.8
601[cpp.cond] CD2 Type of literals in preprocessing expressions Clang 2.7
602[temp.local] C++11 When is the injected-class-name of a class template a template? Clang 2.7
603[temp.type] CD1 Type equivalence and unsigned overflow Clang 3.1
604[over.match.ctor] CD2 Argument list for overload resolution in copy-initialization N/A
605[temp.expl.spec] C++11 Linkage of explicit specializations Clang 2.7
606[temp.deduct.call] CD1 Template argument deduction for rvalue references Clang 3.0
607[class.base.init] CD6 Lookup of mem-initializer-ids Clang 2.7
608[class.virtual] CD2 Determining the final overrider of a virtual function Clang 2.7
609[dcl.type.cv] CD4 What is a “top-level” cv-qualifier? Unknown
610[expr.unary.op] NAD Computing the negative of 0U Clang 2.7
611[dcl.init] CD2 Zero-initializing references Clang 2.7
612[intro.execution] CD2 Requirements on a conforming implementation N/A
613[class.mem] CD1 Unevaluated uses of non-static class members Clang 3.1 (C++11 onwards)
614[expr.mul] CD1 Results of integer / and % Clang 2.7
615[dcl.init] C++11 Incorrect description of variables that can be initialized Clang 2.7
616[intro.defs] CD3 Definition of “indeterminate value” Clang 4
617[conv.lval] NAD Lvalue-to-rvalue conversions of uninitialized char objects Unknown
618[cpp.cond] CD2 Casts in preprocessor conditional expressions Clang 2.7
619[basic.types] C++11 Completeness of array types Clang 3.4
620[class.mem] CD1 Declaration order in layout-compatible POD structs Duplicate of 568
621[temp.expl.spec] C++11 Template argument deduction from function return types Clang 2.7
622[expr.rel] NAD Relational comparisons of arbitrary pointers Unknown
623[basic.stc.dynamic.deallocation] CD3 Use of pointers to deallocated storage N/A
624[expr.new] CD1 Overflow in calculating size of allocation Unknown
625[dcl.spec.auto] CD2 Use of auto as a template-argument Clang 2.9
626[cpp.stringize] CD2 Preprocessor string literals Clang 2.7
627[basic.fundamental] NAD Values behaving as types Clang 2.7
628[dcl.enum] CD2 The values of an enumeration with no enumerator N/A
629[dcl.spec.auto] CD1 auto parsing ambiguity Clang 2.9
630[lex.charset] CD2 Equality of narrow and wide character values in the basic character set Clang 2.7
631[stmt.if] CD3 Jumping into a “then” clause N/A
632[dcl.init.aggr] CD1 Brace-enclosed initializer for scalar member of aggregate Clang 2.7
633[basic.pre] CD2 Specifications for variables that should also apply to references N/A
634[expr.call] CD1 Conditionally-supported behavior for non-POD objects passed to ellipsis redux Clang 2.7
635[class.qual] NAD Names of constructors and destructors of templates Clang 2.7
636[basic.lval] CD4 Dynamic type of objects and aliasing Unknown
637[intro.execution] CD1 Sequencing rules and example disagree Clang 3.0
638[temp.friend] CD2 Explicit specialization and friendship No
639[intro.execution] CD1 What makes side effects “different” from one another? Clang 3.3
640[basic.start.dynamic] NAD Accessing destroyed local objects of static storage duration Unknown
641[over.match.viable] CD2 Overload resolution and conversion-to-same-type operators Clang 2.7
642[basic.scope.block] CD2 Definition and use of “block scope” and “local scope” Clang 2.7
643[dcl.type.simple] NAD Use of decltype in a class member-specification Clang 3.2
644[basic.types] CD1 Should a trivial class type be a literal type? Partial
645[class.mem] CD2 Are bit-field and non-bit-field members layout compatible? N/A
646[basic.types] NAD Can a class with a constexpr copy constructor be a literal type? Superseded by 981
647[dcl.constexpr] CD1 Non-constexpr instances of constexpr constructor templates Clang 3.1
648[dcl.constexpr] CD1 Constant expressions in constexpr initializers Clang 2.7
649[basic.align] CD1 Optionally ill-formed extended alignment requests Clang 3.5
650[class.temporary] CD2 Order of destruction for temporaries bound to the returned value of a function Clang 2.8
651[dcl.type.simple] CD1 Problems in decltype specification and examples Clang 2.7
652[expr.const] CD2 Compile-time evaluation of floating-point expressions Clang 3.1
653[class.copy.assign] CD2 Copy assignment of unions Clang 2.7
654[conv.ptr] CD1 Conversions to and from nullptr_t Superseded by 1423
655[class.base.init] C++11 Initialization not specified for forwarding constructors Clang 3.0
656[dcl.init.ref] CD2 Direct binding to the result of a conversion operator Clang 2.8
657[temp.deduct] CD2 Abstract class parameter in synthesized declaration Partial
658[expr.reinterpret.cast] CD2 Defining reinterpret_cast for pointer types Clang 2.7
659[expr.alignof] CD1 Alignment of function types Clang 3.0
660[dcl.enum] CD1 Unnamed scoped enumerations Clang 3.0
661[expr.rel] CD1 Semantics of arithmetic comparisons Clang 2.7
662[temp.deduct] NAD Forming a pointer to a reference type Clang 2.7
663[extendid] CD1 Valid Cyrillic identifier characters Superseded by P1949
664[dcl.init.ref] CD2 Direct binding of references to non-class rvalue references Clang 2.7
665[expr.dynamic.cast] CD2 Problems in the specification of dynamic_cast Clang 2.8
666[temp.res] CD1 Dependent qualified-ids without the typename keyword Clang 2.8
667[class.copy.ctor] CD2 Trivial special member functions that cannot be implicitly defined Clang 8
668[except.terminate] CD2 Throwing an exception from the destructor of a local static object Unknown
669[dcl.type.simple] NAD Confusing specification of the meaning of decltype Clang 3.1
670[dcl.init] CD4 Copy initialization via derived-to-base conversion in the second step Unknown
671[expr.static.cast] CD1 Explicit conversion from a scoped enumeration type to integral type Clang 2.9
672[expr.new] CD2 Sequencing of initialization in new-expressions Clang 2.7
673[namespace.memdef] NAD Injection of names from elaborated-type-specifiers in friend declarations Clang 2.7
674[temp.friend] C++11 “matching specialization” for a friend declaration Clang 8
675[class.bit] CD3 Signedness of bit-field with typedef or template parameter type Duplicate of 739
676[basic.def] C++11 static_assert-declarations and general requirements for declarations N/A
677[class.dtor] CD1 Deleted operator delete and virtual destructors No
678[basic.def.odr] C++11 Language linkage of member function parameter types and the ODR Unknown
679[temp.type] CD1 Equivalence of template-ids and operator function templates Clang 2.7
680[class.copy.ctor] CD2 What is a move constructor? N/A
681[dcl.fct] CD1 Restrictions on declarators with late-specified return types Partial
682[basic.lookup.classref] CD5 Missing description of lookup of template aliases Unknown
683[class.copy.ctor] CD1 Requirements for trivial subobject special functions Clang 3.3
684[expr.const] CD1 Constant expressions involving the address of an automatic variable Superseded by 1454
685[conv.prom] CD2 Integral promotion of enumeration ignores fixed underlying type Clang 10
686[dcl.name] CD1 Type declarations/definitions in type-specifier-seqs and type-ids Clang 3.0
687[expr.prim.general] NAD template keyword with unqualified-ids Unknown
688[basic.start.static] CD1 Constexpr constructors and static initialization Unknown
689[basic.fundamental] CD5 Maximum values of signed and unsigned integers Unknown
690[intro.defs] CD2 The dynamic type of an rvalue reference Unknown
691[temp.param] C++11 Template parameter packs in class template partial specializations Unknown
692[temp.deduct.type] C++11 Partial ordering of variadic class template partial specializations Clang 16
693[conv.array] CD2 New string types and deprecated conversion Unknown
694[dcl.init] C++11 Zero- and value-initialization of union objects Unknown
695[expr] CD2 Compile-time calculation errors in constexpr functions Unknown
696[class.local] C++11 Use of block-scope constants in local classes Clang 3.1
697[temp.deduct] open Deduction rules apply to more than functions Not resolved
698[intro.execution] open The definition of “sequenced before” is too narrow Not resolved
699[dcl.constexpr] CD2 Must constexpr member functions be defined in the class member-specification? Unknown
700[dcl.constexpr] C++11 Constexpr member functions of class templates Unknown
701[dcl.array] CD2 When is the array-to-pointer conversion applied? Unknown
702[over.ics.rank] CD2 Preferring conversion to std::initializer_list Unknown
703[dcl.init.list] CD2 Narrowing for literals that cannot be exactly represented Unknown
704[over.match.call] CD2 To which postfix-expressions does overload resolution apply? Unknown
705[basic.lookup.argdep] CD2 Suppressing argument-dependent lookup via parentheses Clang 2.7
706[dcl.spec.auto] NAD Use of auto with rvalue references Unknown
707[conv.fpint] CD2 Undefined behavior in integral-to-floating conversions Unknown
708[temp.spec.partial] open Partial specialization of member templates of class templates Not resolved
709[temp.deduct] C++11 Enumeration names as nested-name-specifiers in deduction failure Unknown
710[class.cdtor] CD2 Data races during construction Unknown
711[dcl.spec.auto] CD2 auto with braced-init-list Unknown
712[basic.def.odr] CD3 Are integer constant operands of a conditional-expression “used?” Partial
713[dcl.fct] CD2 Unclear note about cv-qualified function types Clang 3.0
714[class.static.data] CD2 Static const data members and braced-init-lists Unknown
715[expr.const] CD2 Class member access constant expressions Unknown
716[class.union] CD2 Specifications that should apply only to non-static union data members Unknown
717[dcl.stc] CD2 Unintentional restrictions on the use of thread_local Unknown
718[class.friend] NAD Non-class, non-function friend declarations Unknown
719[basic.pre] CD2 Specifications for operator-function-id that should also apply to literal-operator-id Unknown
720[expr.prim.lambda] CD2 Need examples of lambda-expressions Unknown
721[expr.const] CD2 Where must a variable be initialized to be used in a constant expression? Unknown
722[expr.call] CD2 Can nullptr be passed to an ellipsis? Clang 20
726[intro.multithread] CD2 Atomic and non-atomic objects in the memory model Unknown
727[temp.expl.spec] C++17 In-class explicit specializations Partial
728[temp] NAD Restrictions on local classes Unknown
729[except.handle] CD3 Qualification conversions and handlers of reference-to-pointer type Unknown
730[temp.expl.spec] CD2 Explicit specializations of members of non-template classes Unknown
731[expr.ref] CD2 Omitted reference qualification of member function type Unknown
732[dcl.fct.def] CD2 Late-specified return types in function definitions Unknown
733[class.copy.assign] NAD Reference qualification of copy assignment operators Unknown
734[expr.reinterpret.cast] CD2 Are unique addresses required for namespace-scope variables? Unknown
735[basic.stc.dynamic.safety] CD2 Missing case in specification of safely-derived pointers Unknown
736[dcl.decl] NAD Is the & ref-qualifier needed? Unknown
737[dcl.init.string] CD2 Uninitialized trailing characters in string initialization Unknown
738[class.ctor] C++11 constexpr not permitted by the syntax of constructor declarations Unknown
739[class.bit] CD3 Signedness of plain bit-fields Unknown
740[intro.multithread] CD2 Incorrect note on data races Unknown
741[class.bit] C++11 “plain” long long bit-fields Unknown
742[expr.post.incr] open Postfix increment/decrement with long bit-field operands Not resolved
743[expr.prim.general] CD2 Use of decltype in a nested-name-specifier Unknown
744[temp.arg.template] CD2 Matching template arguments with template template parameters with parameter packs Unknown
745[cpp.error] C++23 Effect of ill-formedness resulting from #error Unknown
746[dcl.spec.auto] CD2 Use of auto in new-expressions Unknown
747[class.access.base] dup Access of protected base classes Unknown
749[over.built] CD2 References to function types with a cv-qualifier or ref-qualifier Unknown
750[expr.prim.lambda.closure] CD2 Implementation constraints on reference-only closure objects Unknown
751[expr.prim.lambda.closure] CD2 Deriving from closure classes Unknown
752[expr.prim.lambda] CD2 Name lookup in nested lambda-expressions Unknown
753[expr.prim.lambda.capture] CD2 Array names in lambda capture sets Unknown
754[expr.prim.lambda] CD2 Lambda expressions in default arguments of block-scope function declarations Unknown
755[expr.prim.lambda.capture] CD3 Generalized lambda-captures Unknown
756[expr.prim.lambda.closure] CD2 Dropping cv-qualification on members of closure objects Unknown
757[basic.link] CD2 Types without linkage in declarations Unknown
758[basic.def] C++11 Missing cases of declarations that are not definitions Unknown
759[expr.prim.lambda.closure] CD2 Destruction of closure objects Unknown
760[expr.prim.general] CD2 this inside a nested class of a non-static member function Unknown
761[expr.prim.lambda.closure] CD2 Inferred return type of closure object call operator Unknown
762[expr.prim.lambda] CD2 Name lookup in the compound-statement of a lambda expression Unknown
763[expr.prim.lambda.closure] CD2 Is a closure object's operator() inline? Unknown
764[expr.prim.lambda.capture] CD2 Capturing unused variables in a lambda expression Unknown
765[dcl.fct.spec] CD2 Local types in inline functions with external linkage Unknown
766[expr.prim.lambda] CD2 Where may lambda expressions appear? Unknown
767[expr.prim.lambda] CD2 void and other unnamed lambda-parameters Unknown
768[expr.prim.lambda] CD2 Ellipsis in a lambda parameter list Unknown
769[expr.prim.lambda] CD2 Initialization of closure objects Unknown
770[dcl.decl] CD2 Ambiguity in late-specified return type Unknown
771[expr.prim.lambda.closure] CD2 Move-construction of reference members of closure objects Unknown
772[expr.prim.lambda.capture] CD2 capture-default in lambdas in local default arguments Unknown
773[temp.arg.nontype] C++11 Parentheses in address non-type template arguments Unknown
774[expr.prim.lambda.closure] CD2 Can a closure class be a POD? Unknown
775[expr.prim.lambda.capture] CD2 Capturing references to functions Unknown
776[basic.start.dynamic] CD2 Delegating constructors, destructors, and std::exit Unknown
777[dcl.fct.default] CD2 Default arguments and parameter packs Clang 3.7
778[temp.param] C++11 Template parameter packs in non-type template parameters Unknown
779[expr.prim.lambda.closure] CD2 Rvalue reference members of closure objects? Unknown
782[expr.prim.lambda] CD2 Lambda expressions and argument-dependent lookup Unknown
783[intro.defs] open Definition of “argument” Not resolved
784[intro.structure] C++11 List of incompatibilities with the previous Standard Unknown
785[intro.execution] CD2 “Execution sequence” is inappropriate phraseology Unknown
786[intro.multithread] CD2 Definition of “thread” Unknown
787[lex.phases] CD2 Unnecessary lexical undefined behavior Clang 21
788[lex.charset] CD2 Relationship between locale and values of the execution character set Unknown
789[lex.trigraph] CD2 Deprecating trigraphs Unknown
790[lex.string] CD2 Concatenation of raw and non-raw string literals Unknown
792[basic.start.main] CD2 Effects of std::quick_exit Unknown
793[basic.life] CD2 Use of class members during destruction Unknown
794[conv.mem] NAD Base-derived conversion in member type of pointer-to-member conversion Clang 2.7
795[expr.prim.lambda] NAD Dependency of lambdas on <functional> Unknown
796[expr.prim.lambda] CD2 Lifetime of a closure object with members captured by reference Unknown
797[expr.prim.lambda.closure] CD2 Converting a no-capture lambda to a function type Unknown
798[expr.sub] C++11 Overloaded subscript operator described in clause 5 Unknown
799[expr.reinterpret.cast] CD2 Can reinterpret_cast be used to cast an operand to its own type? Unknown
800[expr.reinterpret.cast] NAD Safely-derived pointers and object pointers converted from function pointers Unknown
801[expr.const.cast] CD2 Casting away constness in a cast to rvalue reference type Unknown
803[expr.sizeof] CD2 sizeof an enumeration type with a fixed underlying type Unknown
804[expr.new] CD2 Deducing the type in new auto(x) Unknown
805[expr.new] CD2 Which exception to throw for overflow in array size calculation Unknown
806[expr.const] CD2 Enumeration types in integral constant expressions Unknown
807[expr.const] NAD typeid expressions in constant expressions Unknown
808[dcl.spec] CD2 Non-type decl-specifiers versus max-munch Unknown
809[dcl.stc] CD2 Deprecation of the register keyword Unknown
810[dcl.stc] CD2 Block-scope thread_local variables should be implicitly static Unknown
811[dcl.type.cv] CD2 Unclear implications of const-qualification Unknown
812[namespace.def] CD2 Duplicate names in inline namespaces Unknown
813[namespace.udecl] open typename in a using-declaration with a non-dependent name Not resolved
814[dcl.attr] CD2 Attribute to indicate that a function throws nothing Unknown
815[dcl.attr.grammar] CD2 Parameter pack expansion inside attributes Unknown
816[dcl.attr.final] CD2 Diagnosing violations of [[final]] Unknown
817[dcl.attr.final] CD2 Meaning of [[final]] applied to a class definition Unknown
818[dcl.fct] CD2 Function parameter packs in non-final positions Unknown
819[special] NAD Access control and deleted implicitly-declared special member functions Unknown
820[temp] CD2 Deprecation of export Clang 2.7
822[temp] NAD Additional contexts for template aliases Unknown
823[temp.arg.nontype] CD2 Literal types with constexpr conversions as non-type template arguments Unknown
828[except.throw] CD2 Destruction of exception objects Unknown
829[except.spec] NAD At what point is std::unexpected called? Unknown
830[except.spec] CD2 Deprecating exception specifications Unknown
831[implimits] CD2 Limit on recursively nested template instantiations Unknown
832[lex.ppnumber] CD2 Value of preprocessing numbers Unknown
833[expr.static.cast] CD2 Explicit conversion of a scoped enumeration value to a floating type Unknown
834[lex.string] CD2 What is an “ordinary string literal”? Unknown
835[expr] CD2 Scoped enumerations and the “usual arithmetic conversions” Unknown
836[dcl.attr.noreturn] NAD [[noreturn]] applied to function types Unknown
837[dcl.constexpr] C++11 Constexpr functions and return braced-init-list Unknown
838[class.base.init] C++11 Use of this in a brace-or-equal-initializer Unknown
839[expr.sizeof] dup sizeof with opaque enumerations Unknown
840[temp.param] CD2 Rvalue references as nontype template parameters Unknown
842[expr.reinterpret.cast] CD2 Casting to rvalue reference type Unknown
845[dcl.fct.def] CD2 What is the “first declaration” of an explicit specialization? Unknown
846[basic.lval] CD2 Rvalue references to functions Unknown
847[temp.deduct.call] CD2 Error in rvalue reference deduction example Unknown
850[expr.prim.general] CD2 Restrictions on use of non-static data members Unknown
852[namespace.udecl] CD6 using-declarations and dependent base classes Unknown
853[basic.stc.dynamic.safety] CD2 Support for relaxed pointer safety Unknown
854[expr.shift] CD2 Left shift and unsigned extended types Unknown
855[expr.assign] CD2 Incorrect comments in braced-init-list assignment example Unknown
858[expr] CD2 Example binding an rvalue reference to an lvalue Unknown
860[dcl.constexpr] C++11 Explicit qualification of constexpr member functions Unknown
861[namespace.qual] CD2 Unintended ambiguity in inline namespace lookup Unknown
862[dcl.enum] CD2 Undefined behavior with enumerator value overflow Unknown
863[expr.post] CD2 Rvalue reference cast to incomplete type Unknown
864[stmt.ranged] C++11 braced-init-list in the range-based for statement Unknown
865[dcl.init.list] CD2 Initializing a std::initializer_list Unknown
869[dcl.init] CD2 Uninitialized thread_local objects Unknown
872[lex.string] CD2 Lexical issues with raw strings Unknown
873[temp.deduct.type] C++11 Deducing rvalue references in declarative contexts Clang 3.0
874[class.mem] CD2 Class-scope definitions of enumeration types Unknown
876[temp.deduct.call] CD2 Type references in rvalue reference deduction specification Unknown
877[over.match.viable] CD2 Viable functions and binding references to rvalues Unknown
879[over.built] CD2 Missing built-in comparison operators for pointer types Unknown
880[over.built] CD2 Built-in conditional operator for scoped enumerations Unknown
882[basic.start.main] CD2 Defining main as deleted Clang 3.5
883[basic.types] CD2 std::memcpy vs std::memmove Unknown
884[temp.expl.spec] CD2 Defining an explicitly-specialized static data member Unknown
885[temp.deduct.partial] NAD Partial ordering of function templates with unordered parameter pairs Unknown
886[dcl.init.aggr] CD2 Member initializers and aggregates Unknown
887[class.copy.ctor] CD2 Move construction of thrown object Unknown
888[class.base.init] CD2 Union member initializers Unknown
891[expr.const.cast] CD2 const_cast to rvalue reference from objectless rvalue Unknown
892[dcl.constexpr] C++11 Missing requirements for constexpr constructors Unknown
893[dcl.enum] NAD Brace syntax for enumerator-definitions Unknown
896[dcl.init.ref] CD2 Rvalue references and rvalue-reference conversion functions Unknown
897[cpp.pragma.op] open _Pragma and extended string-literals Not resolved
898[dcl.constexpr] C++11 Declarations in constexpr functions Unknown
899[over.match.copy] CD2 Explicit conversion functions in direct class initialization Unknown
900[class.temporary] C++23 Lifetime of temporaries in range-based for Unknown
901[expr.new] open Deleted operator delete Not resolved
902[class.static.data] NAD In-class initialization of non-constant static data members Unknown
903[temp.dep.constexpr] CD3 Value-dependent integral null pointer constants Unknown
904[expr.prim.lambda.capture] CD2 Parameter packs in lambda-captures Unknown
905[class] CD2 Explicit defaulted copy constructors and trivial copyability Unknown
906[dcl.fct.def] CD2 Which special member functions can be defaulted? Unknown
908[dcl.fct.def] CD2 Deleted global allocation and deallocation functions Unknown
909[expr.cast] NAD Old-style casts with conversion functions Unknown
910[class.copy.ctor] CD2 Move constructors and implicitly-declared copy constructors Unknown
912[lex.ccon] CD3 Character literals and universal-character-names Unknown
913[temp.deduct.conv] CD2 Deduction rules for array- and function-type conversion functions Unknown
914[expr.type.conv] open Value-initialization of array types Not resolved
915[dcl.fct.def] CD2 Deleted specializations of member function templates Unknown
919[namespace.def] CD2 Contradictions regarding inline namespaces Unknown
920[dcl.meaning] CD2 Interaction of inline namespaces and using-declarations Unknown
921[namespace.def] CD2 Unclear specification of inline namespaces Unknown
922[class.ctor] CD2 Implicit default constructor definitions and const variant members Unknown
923[temp.expl.spec] CD2 Inline explicit specializations Unknown
924[class.mem] C++11 alias-declaration as a class member Unknown
925[cpp.cond] open Type of character literals in preprocessor expressions Not resolved
926[namespace.unnamed] CD2 Inline unnamed namespaces Unknown
927[class.ctor] CD2 Implicitly-deleted default constructors and member initializers Unknown
928[dcl.fct.def] CD2 Defaulting a function that would be implicitly defined as deleted Unknown
929[temp.alias] CD2 What is a template alias? Unknown
930[expr.alignof] CD2 alignof with incomplete array type Clang 2.7
931[lex.ext] CD2 Confusing reference to the length of a user-defined string literal Unknown
932[lex.string] CD2 UCNs in closing delimiters of raw string literals Unknown
933[lex.ccon] CD2 32-bit UCNs with 16-bit wchar_t Unknown
934[dcl.init.list] CD2 List-initialization of references Unknown
935[over.literal] CD2 Missing overloads for character types for user-defined literals Unknown
936[dcl.init.string] CD2 Array initialization with new string literals Unknown
937[lex.ext] NAD Restrictions on values of template arguments in user-defined literals Unknown
938[dcl.init.aggr] C++11 Initializer lists and array new Unknown
939[class.virtual] CD2 Explicitly checking virtual function overriding Unknown
940[dcl.stc] CD2 Global anonymous unions Unknown
941[temp.expl.spec] C++11 Explicit specialization of deleted function template Unknown
942[basic.pre] CD2 Is this an entity? Unknown
943[expr.type.conv] CD5 Is T() a temporary? Unknown
944[expr.reinterpret.cast] NAD reinterpret_cast for all types with the same size and alignment Unknown
945[expr.prim.general] C++11 Use of this in a late-specified return type Unknown
946[basic.start.dynamic] CD2 Order of destruction of local static objects and calls to std::atexit Unknown
947[temp.over] NAD Deducing type template arguments from default function arguments Unknown
948[stmt.select] C++11 constexpr in conditions Clang 3.7
949[intro.compliance] open Requirements for freestanding implementations Not resolved
950[dcl.type.simple] CD2 Use of decltype as a class-name Unknown
951[dcl.attr] CD2 Problems with attribute-specifiers Unknown
952[class.access.base] CD6 Insufficient description of “naming class” Clang 2.8
953[over.ics.ref] CD2 Rvalue references and function viability Unknown
954[over.built] open Overload resolution of conversion operator templates with built-in types Not resolved
955[expr.prim.lambda.closure] CD2 Can a closure type's operator() be virtual? Unknown
956[dcl.fct] CD2 Function prototype scope with late-specified return types Unknown
957[dcl.attr.grammar] CD2 Alternative tokens and attribute-tokens Unknown
958[expr.prim.lambda] NAD Lambdas and decltype Unknown
959[dcl.align] CD2 Alignment attribute for class and enumeration types Unknown
960[class.virtual] CD2 Covariant functions and lvalue/rvalue references Clang 3.0
961[over.ics.rank] CD2 Overload resolution and conversion of std::nullptr_t to bool Unknown
962[dcl.type.elab] CD2 Attributes appertaining to class and enum types Unknown
963[expr.rel] CD2 Comparing nullptr with 0 Unknown
964[basic.lval] C++11 Incorrect description of when the lvalue-to-rvalue conversion applies Unknown
965[dcl.attr.depend] CD2 Limiting the applicability of the carries_dependency attribute Unknown
966[basic.link] CD2 Nested types without linkage Unknown
967[basic.stc.dynamic] NAD Exception specification of replacement allocation function Unknown
968[dcl.attr.grammar] CD2 Syntactic ambiguity of the attribute notation Unknown
969[temp.explicit] CD2 Explicit instantiation declarations of class template specializations Unknown
970[dcl.attr] CD2 Consistent use of “appertain” and “apply” Unknown
971[except.handle] C++11 Incorrect treatment of exception-declarations Unknown
972[dcl.attr.grammar] C++11 Allowing multiple attribute-specifiers Unknown
973[except.spec] CD2 Function types in exception-specifications Unknown
974[expr.prim.lambda] CD3 Default arguments for lambdas Clang 3.3
975[expr.prim.lambda] CD3 Restrictions on return type deduction for lambdas Unknown
976[temp.deduct.conv] CD2 Deduction for const T& conversion operators Unknown
977[dcl.enum] CD3 When is an enumeration type complete? Clang 2.7
978[over.best.ics] CD2 Incorrect specification for copy initialization Unknown
979[dcl.decl] CD2 Position of attribute-specifier in declarator syntax Unknown
980[temp.explicit] CD2 Explicit instantiation of a member of a class template Unknown
981[basic.types] C++11 Constexpr constructor templates and literal types Unknown
982[dcl.init.list] NAD Initialization with an empty initializer list Unknown
983[expr.unary.op] CD2 Ambiguous pointer-to-member constant Unknown
984[dcl.spec.auto] CD2 “Deduced type” is unclear in auto type deduction Unknown
985[lex.digraph] C++11 Alternative tokens and user-defined literals Unknown
986[namespace.udir] CD2 Transitivity of using-directives versus qualified lookup Unknown
987[basic.namespace] CD4 Which declarations introduce namespace members? Unknown
988[dcl.type.simple] CD2 Reference-to-reference collapsing with decltype Unknown
989[dcl.init.list] CD2 Misplaced list-initialization example Unknown
990[dcl.init.list] CD2 Value initialization with multiple initializer-list constructors Clang 3.5
991[dcl.constexpr] CD2 Reference parameters of constexpr functions and constructors Unknown
992[class.copy.ctor] NAD Inheriting explicitness Unknown
993[temp.point] C++11 Freedom to perform instantiation at the end of the translation unit Unknown
994[dcl.fct] C++11 braced-init-list as a default argument Unknown
995[temp.explicit] CD2 Incorrect example for using-declaration and explicit instantiation Unknown
996[temp.spec.partial] C++11 Ambiguous partial specializations of member class templates Unknown
997[basic.lookup.argdep] C++11 Argument-dependent lookup and dependent function template parameter types Unknown
998[dcl.fct] dup Function parameter transformations and template functions Unknown
999[over.match] CD2 “Implicit” or “implied” object argument/parameter? Unknown
1000[class.qual] CD2 Mistaking member typedefs for constructors Unknown
1001[dcl.fct] review Parameter type adjustment in dependent parameter types Not resolved
1002[temp.variadic] NAD Pack expansion for function arguments Unknown
1003[basic.start.main] CD3 Acceptable definitions of main Unknown
1004[temp.local] C++11 Injected-class-names as arguments for template template parameters Clang 5
1005[class.mfct.non.static] NAD Qualified name resolution in member functions of class templates Unknown
1006[temp.param] C++11 std::nullptr_t as a non-type template parameter Unknown
1007[class.protected] NAD Protected access and pointers to members Unknown
1008[expr.alignof] NAD Querying the alignment of an object Unknown
1009[temp] C++11 Missing cases in the declarator-id of a function template declaration Unknown
1010[expr.const] CD2 Address of object with dynamic storage duration in constant expression Unknown
1011[expr.static.cast] C++11 Standard conversions that cannot be inverted Unknown
1012[namespace.unnamed] C++11 Undeprecating static Unknown
1013[conv.lval] CD3 Uninitialized std::nullptr_t objects Unknown
1014[temp.deduct.call] NAD Overload resolution between const T& and T&& Unknown
1015[basic.lookup.argdep] C++11 Template arguments and argument-dependent lookup Unknown
1016[over] C++11 Overloadable declarations, function templates, and references Unknown
1017[class.mfct.non.static] C++11 Member access transformation in unevaluated operands Unknown
1018[dcl.pre] C++11 Ambiguity between simple-declaration and attribute-declaration Unknown
1019[class.derived] dup Dependent simple-template-ids in base-specifiers and mem-initializers Unknown
1020[class.copy.ctor] C++11 Implicitly-defined copy constructors and explicit base class constructors Unknown
1021[namespace.memdef] CD4 Definitions of namespace members Unknown
1022[dcl.enum] C++11 Can an enumeration variable have values outside the values of the enumeration? Unknown
1023[temp.arg.nontype] dup thread_local objects as non-type template arguments Unknown
1024[lex.ccon] CD3 Limits on multicharacter literals Unknown
1025[temp.arg.nontype] C++11 Use of a reference as a non-type template argument Unknown
1026[basic.lval] NAD Cv-qualified non-class rvalues Unknown
1027[basic.life] review Type consistency and reallocation of scalar types Not resolved
1028[temp.dep.res] CD6 Dependent names in non-defining declarations Unknown
1029[class.dtor] C++11 Type of a destructor call Unknown
1030[dcl.init.aggr] C++11 Evaluation order in initializer-lists used in aggregate initialization Unknown
1031[dcl.attr.grammar] C++11 Optional elements in attributes Unknown
1032[temp.variadic] C++11 Empty pack expansions Unknown
1033[dcl.align] C++11 Restrictions on alignment attributes Unknown
1034[expr.prim.lambda] C++11 Attributes for return statements in lambdas Unknown
1035[class.mem] C++11 Omitted and required decl-specifiers Unknown
1036[dcl.align] C++11 Alignment attribute in an exception-declaration Unknown
1037[expr.delete] C++11 Requirements for operands of delete-expressions and deallocation functions Unknown
1038[over.over] CD7 Overload resolution of &x.static_func Unknown
1039[dcl.align] dup Coordinating C and C++ alignment specifications Unknown
1040[intro.multithread] NAD Memory model issues Unknown
1041[class.mem] dup alias-declarations as class members Unknown
1042[dcl.pre] C++11 Attributes in alias-declarations Clang 3.5
1043[temp.dep.type] C++11 Qualified name lookup in the current instantiation Unknown
1044[basic.scope.pdecl] C++11 Point of declaration for an alias-declaration Unknown
1045[temp.explicit] NAD Requiring explicit instantiation declarations Unknown
1046[temp.explicit] open What is a “use” of a class specialization? Not resolved
1047[temp.dep.constexpr] C++11 When is typeid value-dependent? Unknown
1048[expr.prim.lambda] CD3 auto deduction and lambda return type deduction Clang 3.6
1049[class.copy.elision] open Copy elision through reference parameters of inline functions Not resolved
1050[basic.life] NAD Effects of thread support on object lifetime Unknown
1051[class.copy.ctor] C++11 Reference members and generated copy constructors Unknown
1052[class.copy.ctor] dup const non-static data member and PODness Unknown
1053[except.spec] NAD Terminate vs undefined behavior for noexcept violation Unknown
1054[stmt.expr] C++11 Lvalue-to-rvalue conversions in expression statements No
1055[basic.fundamental] C++11 Permissible uses of void Unknown
1056[temp.alias] C++11 Template aliases, member definitions, and the current instantiation Unknown
1057[temp.dep.type] C++11 decltype and the current instantiation Unknown
1058[dcl.init.ref] NAD Reference binding of incompatible array types Unknown
1059[basic.type.qualifier] CD3 Cv-qualified array types (with rvalues) Unknown
1060[expr.const] C++11 Scoped enumerators in integral constant expressions Unknown
1061[expr.new] C++11 Negative array bounds in a new-expression Unknown
1062[expr.prim.lambda] C++11 Syntax of attribute-specifiers in lambdas Unknown
1063[dcl.attr.override] C++11 [[hiding]] with non-attribute declarations Unknown
1064[class.copy.ctor] C++11 Defaulted move constructor for a union Unknown
1065[dcl.attr.override] C++11 [[hiding]] with [[override]] Unknown
1066[class.copy.assign] C++11 When is a copy/move assignment operator implicitly defined? Unknown
1067[dcl.attr.override] NAD [[hiding]], using-declarations, and multiple inheritance Unknown
1068[temp.param] C++11 Template aliases with default arguments and template parameter packs Unknown
1069[dcl.fct] C++11 Incorrect function type with trailing-return-type Unknown
1070[dcl.init.aggr] C++11 Missing initializer clauses in aggregate initialization Clang 3.5
1071[basic.types] C++11 Literal class types and trivial default constructors Unknown
1072[class.mem] C++11 Scoped enumerator with the same name as its containing class Unknown
1073[except.spec] C++11 Merging dynamic-exception-specifications and noexcept-specifications Unknown
1074[temp.dep.constexpr] C++11 Value-dependent noexcept-expressions Unknown
1075[dcl.type.simple] C++11 Grammar does not allow template alias in type-name Unknown
1076[basic.lval] CD5 Value categories and lvalue temporaries Unknown
1077[namespace.memdef] NAD Explicit specializations in non-containing namespaces Unknown
1078[dcl.init.list] NAD Narrowing and the usual arithmetic conversions Unknown
1079[over.ics.rank] C++11 Overload resolution involving aggregate initialization Unknown
1080[class.copy.ctor] C++11 Confusing relationship between templates and copy constructors Unknown
1081[class.dtor] C++11 Defaulted destructor and unusable operator delete Unknown
1082[class.copy.ctor] C++11 Implicit copy function if subobject has none? Unknown
1083[expr.call] C++11 Passing an object to ellipsis with non-trivial move constructor Unknown
1084[class.copy.ctor] NAD Conditions for a deleted move function Unknown
1085[class.copy.assign] NAD Move assignment operators and virtual bases Unknown
1086[expr.const.cast] C++11 const_cast to rvalue reference to function type Unknown
1087[over.match.copy] C++11 Additional applications of issue 899 Unknown
1088[temp.dep.constexpr] C++11 Dependent non-type template arguments Unknown
1089[basic.lookup.qual.general] open Template parameters in member selections Not resolved
1090[basic.align] C++11 Alignment of subobjects Unknown
1091[expr.mptr.oper] C++11 Inconsistent use of the term “object expression” Unknown
1092[class.copy.ctor] drafting Cycles in overload resolution during instantiation Not resolved
1093[dcl.init] CD3 Value-initializing non-objects Unknown
1094[expr.static.cast] C++11 Converting floating-point values to scoped enumeration types Unknown
1095[dcl.init.list] C++11 List-initialization of references Unknown
1096[temp] C++11 Missing requirement for template definitions Unknown
1097[dcl.init.aggr] NAD Aggregate initialization of function parameters Unknown
1098[expr.const] C++11 Pointer conversions in constant expressions Unknown
1099[expr.const] C++11 Infinite recursion in constexpr functions Unknown
1100[expr.const] C++11 constexpr conversion functions and non-type template arguments Unknown
1101[class.static.data] C++11 Non-integral initialized static data members Unknown
1102[intro.execution] C++11 Better example of undefined behavior Unknown
1103[lex.phases] C++11 Reversion of phase 1 and 2 transformations in raw string literals Unknown
1104[lex.digraph] C++11 Global-scope template arguments vs the <: digraph Unknown
1105[lex.name] C++11 Issues relating to TR 10176:2003 Unknown
1106[lex.nullptr] C++11 Need more detail in nullptr keyword description Unknown
1107[lex.ext] C++11 Overload resolution for user-defined integer literals Unknown
1108[lex.ext] NAD User-defined literals have not been implemented Unknown
1109[basic.def.odr] C++11 When is “use” a reference to the ODR meaning? Unknown
1110[basic.def.odr] NAD Incomplete return type should be allowed in decltype operand Clang 3.1
1111[basic.lookup.classref] C++11 Remove dual-scope lookup of member template names Partial
1112[basic.link] C++11 constexpr variables should have internal linkage like const Unknown
1113[basic.link] C++11 Linkage of namespace member of unnamed namespace Partial
1114[basic.life] C++11 Incorrect use of placement new in example Unknown
1115[basic.align] C++11 C-compatible alignment specification Unknown
1116[basic.life] CD4 Aliasing of union members Unknown
1117[expr] C++11 Incorrect note about xvalue member access expressions Unknown
1118[expr.prim.lambda.capture] NAD Implicit lambda capture via explicit copy constructor Unknown
1119[expr.ref] C++11 Missing case in description of member access ambiguity Unknown
1120[expr.reinterpret.cast] C++11 reinterpret_cast and void* Unknown
1121[expr.unary.op] C++11 Unnecessary ambiguity error in formation of pointer to member Unknown
1122[expr.sizeof] C++11 Circular definition of std::size_t Unknown
1123[expr.unary.noexcept] C++11 Destructors should be noexcept by default Unknown
1124[expr.mptr.oper] NAD Error in description of value category of pointer-to-member expression Unknown
1125[expr.const] C++11 Unclear definition of “potential constant expression” Unknown
1126[expr.const] C++11 constexpr functions in const initializers Unknown
1127[expr.const] C++11 Overload resolution in constexpr functions Unknown
1128[dcl.spec] C++11 attribute-specifiers in decl-specifier-seqs Unknown
1129[dcl.constexpr] C++11 Default nothrow for constexpr functions Unknown
1130[dcl.type.simple] C++11 Function parameter type adjustments and decltype Unknown
1131[dcl.type.elab] C++11 Template aliases in elaborated-type-specifiers Unknown
1132[dcl.attr.noreturn] NAD Keyword vs attribute for noreturn Unknown
1133[dcl.attr.override] C++11 Keywords vs attributes for control of hiding and overriding Unknown
1134[dcl.fct.def.default] C++11 When is an explicitly-defaulted function defined? Unknown
1135[dcl.fct.def.default] C++11 Explicitly-defaulted non-public special member functions Unknown
1136[dcl.fct.def.default] C++11 Explicitly-defaulted explicit constructors Unknown
1137[dcl.fct.def.default] C++11 Explicitly-defaulted virtual special member functions Unknown
1138[dcl.init.ref] C++11 Rvalue-ness check for rvalue reference binding is wrong Unknown
1139[dcl.init.ref] C++11 Rvalue reference binding to scalar xvalues Unknown
1140[class] C++11 Incorrect redefinition of POD class Unknown
1141[class.mem] NAD Non-static data member initializers have not been implemented Unknown
1142[class.mfct] C++11 friend declaration of member function of containing class Unknown
1143[class.mfct.non.static] NAD Move semantics for *this have not been implemented Unknown
1144[class.access.dcl] C++11 Remove access declarations Unknown
1145[class.ctor] C++11 Defaulting and triviality Unknown
1146[class.dtor] C++11 exception-specifications of defaulted functions Unknown
1147[class.dtor] C++11 Destructors should be default nothrow Unknown
1148[class.copy.elision] C++11 Copy elision and move construction of function parameters Unknown
1149[class.copy.ctor] C++11 Trivial non-public copy operators in subobjects Unknown
1150[class.inhctor] NAD Inheriting constructors have not been implemented N/A
1151[over.match.list] C++11 Overload resolution with initializer-list and non-list constructors Unknown
1152[over.match.viable] C++11 Rules for determining existence of implicit conversion sequence Unknown
1153[over.over] C++11 Type matching in address of overloaded function Unknown
1154[temp.arg.nontype] C++11 Address of thread_local variable as non-type template argument Unknown
1155[temp.arg.nontype] C++11 Internal-linkage non-type template arguments Unknown
1156[temp.func.order] C++11 Partial ordering in a non-call context Unknown
1157[temp.func.order] open Partial ordering of function templates is still underspecified Not resolved
1158[temp.alias] C++11 Recursive instantiation via alias template Unknown
1159[temp.alias] C++11 Class and enumeration definitions in template aliases Unknown
1160[temp.dep.type] C++11 Definitions of template members and the current instantiation Unknown
1161[temp.res] C++11 Dependent nested-name-specifier in a pointer-to-member declarator Unknown
1162[temp.res] NAD Dependent elaborated-type-specifiers in non-deduced contexts Unknown
1163[temp.explicit] NAD extern template prevents inlining functions not marked inline Unknown
1164[temp.deduct.call] C++11 Partial ordering of f(T&) and f(T&&) Unknown
1165[except.ctor] C++11 Exceptions when destroying array elements Unknown
1166[except.handle] C++11 exception-declarations that do not declare objects Unknown
1167[except.spec] C++11 function-try-blocks for destructors Unknown
1168[except.terminate] C++11 Additional reasons to call std::terminate Unknown
1169[cpp.predefined] C++11 Missing feature macro for strict pointer safety Unknown
1170[temp.deduct] C++11 Access checking during template argument deduction Unknown
1171[except.terminate] C++11 Partial stack unwinding with noexcept violation Unknown
1172[temp.deduct] drafting “instantiation-dependent” constructs Not resolved
1173[intro.execution] C++11 Unclear specification of effects of signal handling Unknown
1174[basic.def.odr] C++11 When is a pure virtual function “used?” Unknown
1175[lex.ext] C++11 Disambiguating user-defined literals Unknown
1176[intro.multithread] C++11 Definition of release sequence Unknown
1177[intro.multithread] C++11 Intra-thread dependency-ordered-before Unknown
1178[temp.deduct.decl] C++11 Deduction failure matching placement new Unknown
1179[temp.param] NAD Cv-qualification of non-type template parameters Unknown
1180[basic.align] C++11 Over-aligned class types Unknown
1181[basic.types] C++11 What is a “built-in type?” Unknown
1182[temp.variadic] C++11 Incorrect description of pack expansion syntax Unknown
1183[dcl.fct] C++11 Expansion of parameter packs in declarators Unknown
1184[temp.deduct.call] C++11 Argument conversions to nondeduced parameter types Unknown
1185[dcl.link] C++11 Misleading description of language linkage and member function types Unknown
1186[dcl.constexpr] C++11 Non-dependent constexpr violations in function templates Unknown
1187[basic.start.static] C++11 Problems in initialization example Unknown
1188[expr.const] C++11 Type punning in constant expressions Unknown
1189[intro.object] C++11 Address of distinct base class subobjects Unknown
1190[basic.stc.dynamic.safety] C++11 Operations on non-safely-derived pointers Unknown
1191[class.ctor] C++11 Deleted subobject destructors and implicitly-defined constructors Unknown
1192[basic.def.odr] C++11 Inadvertent change to ODR and templates Unknown
1193[expr.const] C++11 Use of address-constant pointers in constant expressions Unknown
1194[dcl.constexpr] C++11 Constexpr references Unknown
1195[dcl.constexpr] C++11 References to non-literal types in constexpr functions Unknown
1196[temp.explicit] C++11 Definition required for explicit instantiation after explicit specialization? Unknown
1197[expr.const] C++11 Constexpr arrays Unknown
1198[basic.types] C++11 Literal types and copy constructors Unknown
1199[dcl.constexpr] C++11 Deleted constexpr functions Unknown
1200[basic.lookup.unqual] CD6 Lookup rules for template parameters N/A
1201[basic.def] C++11 Are deleted and defaulted functions definitions? Unknown
1202[class.cdtor] C++11 Calling virtual functions during destruction Unknown
1203[class.static.data] dup Misleading note regarding initialized static data members Unknown
1204[stmt.iter] C++11 Specifiers in a for-range-declaration Unknown
1205[over.ics.ref] dup Lvalue reference binding and function viability Unknown
1206[temp.class] C++11 Defining opaque enumeration members of class templates Unknown
1207[class.mfct.non.static] C++11 Type of class member in trailing-return-type Unknown
1208[class.mfct.non.static] C++11 Explicit noexcept in defaulted definition Unknown
1209[basic.def.odr] open Is a potentially-evaluated expression in a template definition a “use?” Not resolved
1210[basic.scope.pdecl] C++11 Injection of elaborated-type-specifier in enumeration scope Unknown
1211[basic.align] open Misaligned lvalues Not resolved
1212[dcl.type.simple] C++11 Non-function-call xvalues and decltype Unknown
1213[expr.sub] CD3 Array subscripting and xvalues Clang 7
1214[dcl.init] C++11 Kinds of initializers Unknown
1215[class] C++11 Definition of POD struct Unknown
1216[except.spec] C++11 Exceptions “allowed” by a noexcept-specification Unknown
1217[dcl.fct.def.delete] NAD Are deleted functions implicitly noexcept? Unknown
1218[except.handle] C++11 What is the “currently-handled exception” in a multi-threaded program? Unknown
1219[basic.types] C++11 Non-static data member initializers in constant expressions Unknown
1220[basic.lookup.classref] C++11 Looking up conversion-type-ids Unknown
1221[temp.deduct.partial] open Partial ordering and reference collapsing Not resolved
1222[dcl.array] NAD Unnecessary restriction on auto array types Unknown
1223[stmt.ambig] CD7 Syntactic disambiguation and trailing-return-types Clang 17
1224[class.copy.ctor] C++11 constexpr defaulted copy constructors Unknown
1225[dcl.constexpr] C++11 constexpr constructors and virtual bases Unknown
1226[dcl.fct.default] CD3 Converting a braced-init-list default argument Unknown
1227[temp.deduct] CD3 Mixing immediate and non-immediate contexts in deduction failure Clang 3.0
1228[over.match.list] NAD Copy-list-initialization and explicit constructors Unknown
1229[over.match.list] C++11 Overload resolution with empty braced-init-list argument Unknown
1230[expr.unary.op] dup Confusing description of ambiguity of destructor name Unknown
1231[temp.variadic] C++11 Variadic templates requiring an empty pack expansion Unknown
1232[dcl.init.list] C++11 Creation of array temporaries using a braced-init-list Unknown
1233[temp.dep] C++11 Pack expansions and dependent calls Unknown
1234[dcl.name] C++11 abstract-declarator does not permit ... after ptr-operator Unknown
1235[temp.func.order] C++11 “Unused” ellipsis and default arguments in partial ordering Unknown
1236[dcl.init.ref] C++11 Inconsistently-interrelated examples Unknown
1237[class.temporary] C++11 Deprecated implicit copy assignment in example Unknown
1238[over.ics.rank] C++11 Overloading ambiguity binding reference to function Unknown
1239[lex.ext] C++11 Hexadecimal floating-point literals vs user-defined literals Unknown
1240[dcl.name] C++11 constexpr defaulted constructors Unknown
1241[class.dtor] C++11 Which members does a destructor destroy? Unknown
1242[class.base.init] C++11 Initializing variant class members Unknown
1243[dcl.decl] C++11 Misleading footnote regarding multiple-declarator declarations Unknown
1244[temp.type] C++11 Equivalence of alias templates and class templates Unknown
1245[temp.mem.func] C++11 Matching declarations involving decltype Unknown
1246[temp.param] C++11 Non-deduced non-final parameter packs Unknown
1247[dcl.typedef] CD4 Restriction on alias name appearing in type-id Unknown
1248[diff.iso] open Updating Annex C to C99 and C23 Not resolved
1249[expr.prim.lambda.capture] CD6 Cv-qualification of nested lambda capture Unknown
1250[class.virtual] CD3 Cv-qualification of incomplete virtual function return types Clang 3.9
1251[diff.conv] CD3 C compatibility: casting to unqualified void* Unknown
1252[over.load] CD6 Overloading member function templates based on dependent return type Unknown
1253[temp.spec] C++17 Generic non-template members Unknown
1254[basic.def.odr] NAD odr-use vs template arguments and constexpr functions Unknown
1255[expr.const] drafting Definition problems with constexpr functions Not resolved
1256[expr.const] open Unevaluated operands are not necessarily constant expressions Not resolved
1257[temp.res] open Instantiation via non-dependent references in uninstantiated templates Not resolved
1258[temp.point] CD5 “Instantiation context” differs from dependent lookup rules Unknown
1259[expr.delete] NAD Deleting a POD via a pointer to base Unknown
1260[basic.def.odr] CD3 Incorrect use of term “overloaded” in description of odr-use Unknown
1261[expr] CD3 Explicit handling of cv-qualification with non-class prvalues Unknown
1262[temp.deduct] CD3 Default template arguments and deduction failure Unknown
1263[dcl.init.ref] NAD Mismatch between rvalue reference binding and overload resolution Unknown
1264[expr.const] CD3 Use of this in constexpr constructor Unknown
1265[dcl.spec.auto] CD3 Mixed use of the auto specifier Clang 5
1266[lex.ext] open user-defined-integer-literal overflow Not resolved
1267[except.spec] CD3 Rvalue reference types in exception-specifications Unknown
1268[expr.reinterpret.cast] CD3 reinterpret_cast of an xvalue operand Unknown
1269[expr.dynamic.cast] CD3 dynamic_cast of an xvalue operand Unknown
1270[dcl.init.list] CD3 Brace elision in array temporary initialization Unknown
1271[temp.res] CD5 Imprecise wording regarding dependent types Unknown
1272[class.static.data] NAD Implicit definition of static data member of const literal type Unknown
1273[temp.deduct] NAD Accessibility and function signatures Unknown
1274[stmt.ranged] CD4 Common nonterminal for expression and braced-init-list Unknown
1275[temp.param] CD3 Incorrect comment in example of template parameter pack restriction Unknown
1276[basic.fundamental] NAD Reference to stdint.h Unknown
1277[cstdint.syn] NAD Lax definition of intmax_t and uintmax_t Unknown
1278[over.call.func] drafting Incorrect treatment of contrived object Not resolved
1279[diff.cpp03] open Additional differences between C++ 2003 and C++ 2011 Not resolved
1280[basic.life] NAD Object reallocation and reference members Unknown
1281[temp.dep.type] NAD Virtual and dependent base classes Unknown
1282[except.spec] CD3 Underspecified destructor exception-specification Unknown
1283[class.static.data] open Static data members of classes with typedef name for linkage purposes Not resolved
1284[basic.life] CD4 Should the lifetime of an array be independent of that of its elements? Unknown
1285[basic.life] NAD Trivial destructors and object lifetime Unknown
1286[temp.alias] open Equivalence of alias templates Not resolved
1287[dcl.init.ref] C++14 Direct initialization vs “implicit” conversion in reference binding Unknown
1288[dcl.init.list] CD3 Reference list initialization Unknown
1289[temp.dep.type] NAD Can an alias template name the current instantiation? Unknown
1290[dcl.init.list] CD3 Lifetime of the underlying array of an initializer_list member Unknown
1291[basic.lookup.classref] CD6 Looking up a conversion-type-id N/A
1292[temp.dep] CD4 Dependent calls with braced-init-lists containing a pack expansion Unknown
1293[expr.const] CD3 String literals in constant expressions Unknown
1294[basic.start.static] open Side effects in dynamic/static initialization Not resolved
1295[dcl.init.ref] CD3 Binding a reference to an rvalue bit-field Clang 4
1296[temp.res] CD3 Ill-formed template declarations (not just definitions) Unknown
1297[dcl.decl] CD3 Misplaced function attribute-specifier Unknown
1298[over.ics.rank] CD3 Incorrect example in overload resolution Unknown
1299[class.temporary] CD5 “Temporary objects” vs “temporary expressions” Unknown
1300[expr.type.conv] dup T() for array types Unknown
1301[dcl.init] CD3 Value initialization of union Unknown
1302[basic.fundamental] CD3 noexcept applied to expression of type void Unknown
1303[temp] NAD C language linkage for template with internal linkage Unknown
1304[dcl.init.string] drafting Omitted array bound with string initialization Not resolved
1305[expr.alignof] CD3 alignof applied to array of unknown size Clang 3.0
1306[class.this] CD3 Modifying an object within a const member function Unknown
1307[over.ics.list] C++14 Overload resolution based on size of array initializer-list Clang 14
1308[class.mem] CD3 Completeness of class type within an exception-specification Superseded by 1330
1309[temp.dep.type] CD4 Incorrect note regarding lookup of a member of the current instantiation Unknown
1310[class.qual] CD3 What is an “acceptable lookup result?” Clang 5
1311[expr.const] CD3 Volatile lvalues in constant expressions Unknown
1312[expr.const] CD3 Simulated reinterpret_cast in constant expressions Unknown
1313[expr.const] CD3 Undefined pointer arithmetic in constant expressions Unknown
1314[expr.add] NAD Pointer arithmetic within standard-layout objects Unknown
1315[temp.spec.partial.general] CD4 Restrictions on non-type template arguments in partial specializations Partial
1316[dcl.constexpr] NAD constexpr function requirements and class scope Unknown
1317[dcl.enum] NAD Unnamed scoped enumerations Unknown
1318[class] CD3 Syntactic ambiguities with final Unknown
1319[temp.param] NAD Error in pack expansion example Unknown
1320[expr.static.cast] CD3 Converting scoped enumerations to bool Unknown
1321[temp.over.link] CD3 Equivalency of dependent calls Unknown
1322[temp.deduct] drafting Function parameter type decay in templates Not resolved
1323[dcl.attr.grammar] NAD Nonexistent nonterminal in alignment-specifier grammar Unknown
1324[dcl.init] CD3 Value initialization and defaulted constructors Unknown
1325[dcl.pre] NAD Omitted declarator in friend declarations Unknown
1326[temp.deduct.call] dup Deducing an array bound from an initializer-list Unknown
1327[dcl.fct.def.default] CD3 virt-specifier in a defaulted definition Unknown
1328[dcl.init.ref] CD3 Conflict in reference binding vs overload resolution Unknown
1329[implimits] CD3 Recursive deduction substitutions Unknown
1330[temp.deduct] CD3 Delayed instantiation of noexcept specifiers Clang 4 (C++11 onwards)
1331[dcl.fct.def.default] CD5 const mismatch with defaulted copy constructor Unknown
1332[lex.charset] CD5 Handling of invalid universal-character-names Unknown
1333[dcl.fct.def.default] CD3 Omission of const in a defaulted copy constructor Unknown
1334[basic.types] NAD Layout compatibility and cv-qualification Superseded by 1719
1335[cpp.stringize] CD6 Stringizing, extended characters, and universal-character-names Unknown
1336[class.conv.ctor] CD3 Definition of “converting constructor” Unknown
1337[temp.deduct.partial] dup Partial ordering and non-deduced parameters Unknown
1338[basic.stc.dynamic.allocation] CD4 Aliasing and allocation functions Unknown
1339[dcl.init] NAD Parenthesized braced-init-list and arrays Unknown
1340[expr.mptr.oper] CD3 Complete type in member pointer expressions Clang 2.9
1341[class.mem] NAD Bit-field initializers Superseded by P0683R1
1342[dcl.decl] CD6 Order of initialization with multiple declarators Unknown
1343[intro.execution] C++17 Sequencing of non-class initialization Unknown
1344[class.copy.ctor] C++14 Adding new special member functions to a class via default arguments Unknown
1345[class.base.init] CD3 Initialization of anonymous union class members Unknown
1346[dcl.spec.auto] CD3 expression-list initializers and the auto specifier Clang 3.5
1347[dcl.spec.auto] CD3 Consistency of auto in multiple-declarator declarations Clang 3.1
1348[dcl.spec.auto] drafting Use of auto in a trailing-return-type Not resolved
1349[temp.alias] dup Consistency of alias template redeclarations Unknown
1350[class.inhctor] CD3 Incorrect exception specification for inherited constructors Clang 3.5
1351[except.spec] CD4 Problems with implicitly-declared exception-specifications Unknown
1352[basic.scope.class] CD3 Inconsistent class scope and completeness rules Clang 3.0
1353[class.ctor] CD7 Array and variant members and deleted special member functions Unknown
1354[expr.unary.noexcept] CD3 Destructor exceptions for temporaries in noexcept expressions Unknown
1355[dcl.fct.def.default] CD3 Aggregates and “user-provided” constructors Unknown
1356[except.spec] CD4 Exception specifications of copy assignment operators with virtual bases Unknown
1357[class.mem] CD3 brace-or-equal-initializers for function and typedef members Unknown
1358[dcl.constexpr] CD3 Unintentionally ill-formed constexpr function template instances Clang 3.1
1359[dcl.constexpr] CD3 constexpr union constructors Clang 3.5
1360[class.ctor] CD6 constexpr defaulted default constructors Unknown
1361[basic.types] CD3 Requirement on brace-or-equal-initializers of literal types Unknown
1362[basic.def.odr] CD3 Complete type required for implicit conversion to T& Unknown
1363[class] CD3 Triviality vs multiple default constructors Unknown
1364[expr.const] CD3 constexpr function parameters Unknown
1365[expr.const] CD3 Calling undefined constexpr functions Unknown
1366[dcl.constexpr] CD3 Deleted constexpr constructors and virtual base classes Unknown
1367[expr.const] CD3 Use of this in a constant expression Unknown
1368[dcl.init] CD3 Value initialization and defaulted constructors (part 2) Unknown
1369[dcl.constexpr] CD3 Function invocation substitution of this Unknown
1370[cpp.replace] CD3 identifier-list cannot contain ellipsis Unknown
1371[temp.deduct.type] NAD Deduction from T&& in return types Unknown
1372[temp.deduct.conv] CD3 Cross-references incorrect in conversion function template argument deduction Unknown
1373[over.match.ref] dup Overload resolution changes matching reference-binding changes Unknown
1374[over.ics.rank] CD3 Qualification conversion vs difference in reference binding Unknown
1375[class.union] CD3 Reference to anonymous union? Unknown
1376[expr.static.cast] C++14 static_cast of temporary to rvalue reference Unknown
1377[diff.cpp03] dup Access declarations not mentioned in Annex C Unknown
1378[temp.inst] CD5 When is an instantiation required? Unknown
1379[dcl.init.list] NAD Is std::initializer_list an aggregate? Unknown
1380[dcl.fct] CD3 Type definitions in template-parameter parameter-declarations Unknown
1381[except.spec] CD3 Implicitly-declared special member functions and default nothrow Unknown
1382[dcl.decl] CD3 Dead code for constructor names Unknown
1383[expr] CD3 Clarifying discarded-value expressions Unknown
1384[expr.const] NAD reinterpret_cast in constant expressions Unknown
1385[over.match.oper] CD3 Syntactic forms of conversion functions for surrogate call functions Unknown
1386[temp.arg.explicit] NAD Explicitly-specified partial argument list with multiple parameter packs Unknown
1387[temp.deduct.type] CD3 Missing non-deduced context for decltype Unknown
1388[temp.deduct.call] CD3 Missing non-deduced context following a function parameter pack Clang 4
1389[dcl.fct] NAD Recursive reference in trailing-return-type Unknown
1390[temp.dep.type] drafting Dependency of alias template specializations Not resolved
1391[temp.arg.explicit] CD4 Conversions to parameter types with non-deduced template arguments Partial
1392[over.match.ref] CD3 Explicit conversion functions for references and non-references Unknown
1393[temp.variadic] C++17 Pack expansions in using-declarations Unknown
1394[dcl.fct] CD3 Incomplete types as parameters of deleted functions Clang 15
1395[temp.deduct.type] C++17 Partial ordering of variadic templates reconsidered Clang 16
1396[temp.inst] C++23 Deferred instantiation and checking of non-static data member initializers Unknown
1397[class.mem] CD4 Class completeness in non-static data member initializers Clang 3.2
1398[temp.arg.nontype] CD3 Non-type template parameters of type std::nullptr_t Unknown
1399[temp.deduct.call] CD3 Deduction with multiple function parameter packs Duplicate of 1388
1400[expr.eq] NAD Function pointer equality Unknown
1401[dcl.init.ref] CD3 Similar types and reference compatibility Unknown
1402[class.copy.ctor] CD3 Move functions too often deleted Unknown
1403[lex.comment] CD6 Universal-character-names in comments Unknown
1404[class.union] open Object reallocation in unions Not resolved
1405[basic.types] CD3 constexpr and mutable members of literal types Unknown
1406[temp.func.order] CD3 ref-qualifiers and added parameters of non-static member function templates Unknown
1407[expr.const] NAD Integral to bool conversion in converted constant expressions Unknown
1408[over.ics.rank] CD3 What is “the same aggregate initialization?” Unknown
1409[over.ics.list] CD3 What is the second standard conversion sequence of a list-initialization sequence? Unknown
1410[over.ics.rank] CD3 Reference overload tiebreakers should apply to rvalue references Unknown
1411[class] CD3 More on global scope :: in nested-name-specifier Unknown
1412[expr.static.cast] CD3 Problems in specifying pointer conversions Unknown
1413[temp.dep.constexpr] CD3 Missing cases of value-dependency Clang 12
1414[dcl.init.ref] drafting Binding an rvalue reference to a reference-unrelated lvalue Not resolved
1415[basic.link] CD3 Missing prohibition of block-scope definition of extern object Unknown
1416[expr.typeid] CD3 Function cv-qualifiers and typeid Unknown
1417[dcl.fct] C++14 Pointers/references to functions with cv-qualifiers or ref-qualifier Unknown
1418[dcl.init.list] CD3 Type of initializer_list backing array Unknown
1419[dcl.init.list] NAD Evaluation order in aggregate initialization Unknown
1420[class.abstract] NAD Abstract final classes Unknown
1421[dcl.init.list] NAD Full expressions and aggregate initialization Unknown
1422[lex.ccon] dup Type of character literals containing universal-character-names Unknown
1423[conv.fctptr] CD3 Convertibility of nullptr to bool Clang 11
1424[except.ctor] C++14 When must sub-object destructors be accessible? Unknown
1425[class.mem] CD3 Base-class subobjects of standard-layout structs N/A (ABI constraint)
1426[dcl.fct.def.default] CD5 Allowing additional parameter types in defaulted functions Unknown
1427[class.ctor] NAD Default constructor and deleted or inaccessible destructors Unknown
1428[basic.type.qualifier] CD3 Dynamic const objects Unknown
1429[basic.scope.temp] NAD Scope of a member template's template parameter Unknown
1430[temp.alias] open Pack expansion into fixed alias template parameter list Not resolved
1431[except] CD3 Exceptions from other than throw-expressions Unknown
1432[temp.variadic] open Newly-ambiguous variadic template expansions Not resolved
1433[basic.scope.pdecl] NAD trailing-return-type and point of declaration Unknown
1434[dcl.init] NAD Parenthesized braced-init-list Unknown
1435[dcl.meaning] CD3 template-id as the declarator for a class template constructor Unknown
1436[cpp.cond] open Interaction of constant expression changes with preprocessor expressions Not resolved
1437[dcl.typedef] CD3 alignas in alias-declaration Unknown
1438[basic.stc.dynamic.safety] CD3 Non-dereference use of invalid pointers Unknown
1439[namespace.memdef] CD3 Lookup and friend template declarations Unknown
1440[expr.prim.general] CD3 Acceptable decltype-specifiers used as nested-name-specifiers Unknown
1441[intro.execution] C++14 Unclear wording for signal handler restrictions Unknown
1442[stmt.ranged] CD3 Argument-dependent lookup in the range-based for Unknown
1443[dcl.fct.default] NAD Default arguments and non-static data members Clang 2.7
1444[temp.param] drafting Type adjustments of non-type template parameters Not resolved
1445[stmt.ranged] dup Argument-dependent lookup of begin and end Unknown
1446[temp.func.order] CD4 Member function with no ref-qualifier and non-member function with rvalue reference Unknown
1447[expr.static.cast] CD3 static_cast of bit-field lvalue to rvalue reference Unknown
1448[basic.fundamental] NAD Integral values of type bool Unknown
1449[dcl.init.list] CD3 Narrowing conversion of negative value to unsigned type Unknown
1450[expr.mul] CD3 INT_MIN % -1 Unknown
1451[temp.arg.nontype] CD4 Objects with no linkage in non-type template arguments Unknown
1452[expr.const] NAD Value-initialized objects may be constants Unknown
1453[basic.types] CD3 Volatile members in literal classes? Unknown
1454[expr.const] CD3 Passing constants through constexpr functions via references Unknown
1455[expr.const] CD3 Lvalue converted constant expressions Unknown
1456[expr.const] CD3 Address constant expression designating the one-past-the-end address Unknown
1457[expr.shift] CD3 Undefined behavior in left-shift Unknown
1458[expr.unary.op] CD3 Address of incomplete type vs operator&() Clang 3.1
1459[over.ics.rank] open Reference-binding tiebreakers in overload resolution Not resolved
1460[class.union] C++14 What is an empty union? Clang 3.5
1461[dcl.init.list] NAD Narrowing conversions to bit-fields Unknown
1462[temp.deduct] CD3 Deduction failure vs “ill-formed, no diagnostic required” Unknown
1463[temp.pre] drafting extern "C" alias templates Not resolved
1464[expr.new] CD3 Negative array bound in a new-expression Unknown
1465[expr.unary.noexcept] CD4 noexcept and std::bad_array_new_length Unknown
1466[intro.multithread] C++14 Visible sequences of side effects are redundant Unknown
1467[dcl.init.list] CD4 List-initialization of aggregate from same-type object Clang 3.7 (C++11 onwards)
1468[expr.prim.lambda.capture] CD5 typeid, overload resolution, and implicit lambda capture Unknown
1469[expr.new] CD5 Omitted bound in array new-expression Unknown
1470[intro.multithread] NAD Thread migration Unknown
1471[temp.dep.type] CD3 Nested type of non-dependent base Unknown
1472[basic.def.odr] CD3 odr-use of reference variables Unknown
1473[over.literal] CD3 Syntax of literal-operator-id Unknown
1474[lex.ext] NAD User-defined literals and <inttypes.h> format macros Unknown
1475[dcl.attr.depend] CD3 Errors in [[carries_dependency]] example Unknown
1476[intro.defs] CD3 Definition of user-defined type Unknown
1477[namespace.memdef] CD3 Definition of a friend outside its namespace Clang 2.7
1478[temp.names] CD6 template keyword for dependent template template arguments Unknown
1479[over.literal] CD3 Literal operators and default arguments Clang 3.1
1480[expr.const] CD3 Constant initialization via non-constant temporary Unknown
1481[over.inc] CD3 Increment/decrement operators with reference parameters Unknown
1482[basic.scope.pdecl] CD3 Point of declaration of enumeration Clang 3.0
1483[temp.res] NAD Non-dependent static_assert-declarations Unknown
1484[temp.inst] CD4 Unused local classes of function templates Unknown
1485[dcl.enum] drafting Out-of-class definition of member unscoped opaque enumeration Not resolved
1486[temp.deduct.funcaddr] drafting Base-derived conversion in member pointer deduction Not resolved
1487[class.inhctor] CD3 When are inheriting constructors declared? Clang 3.3
1488[dcl.name] drafting abstract-pack-declarators in type-ids Not resolved
1489[basic.start.static] CD3 Is value-initialization of an array constant initialization? Unknown
1490[dcl.init.list] CD4 List-initialization from a string literal Clang 3.7 (C++11 onwards)
1491[class.copy.ctor] CD3 Move construction and rvalue reference members Unknown
1492[class.dtor] CD4 Exception specifications on template destructors Unknown
1493[class.copy.ctor] C++14 Criteria for move-construction Unknown
1494[dcl.init.list] CD3 Temporary initialization for reference binding in list-initialization Unknown
1495[temp.spec.partial] CD3 Partial specialization of variadic class template Clang 4
1496[class.name] CD4 Triviality with deleted and missing default constructors No
1497[dcl.init.aggr] NAD Aggregate initialization with parenthesized string literal Unknown
1498[stmt.ranged] dup Lifetime of temporaries in range-based for Unknown
1499[class.copy.assign] CD7 Missing case for deleted move assignment operator Unknown
1500[temp.dep.candidate] CD6 Name lookup of dependent conversion function Unknown
1501[dcl.init.list] NAD Nested braces in list-initialization Unknown
1502[dcl.init] CD3 Value initialization of unions with member initializers Unknown
1503[except.throw] CD3 Exceptions during copy to exception object Unknown
1504[expr.add] CD3 Pointer arithmetic after derived-base conversion Unknown
1505[dcl.init.list] dup Direct binding of reference to temporary in list-initialization Unknown
1506[dcl.init.list] CD3 Value category of initializer_list object Unknown
1507[dcl.init] CD3 Value initialization with trivial inaccessible default constructor Unknown
1508[dcl.init.list] C++14 Template initializer-list constructors Unknown
1509[intro.defs] C++14 Definition of “non-template function” Unknown
1510[dcl.ref] CD3 cv-qualified references via decltype Unknown
1511[basic.def.odr] CD3 const volatile variables and the one-definition rule Unknown
1512[expr.rel] CD3 Pointer comparison vs qualification conversions Clang 4
1513[temp.deduct.call] drafting initializer_list deduction failure Not resolved
1514[class.bit] C++14 Ambiguity between enumeration definition and zero-length bit-field Clang 11
1515[basic.fundamental] CD3 Modulo 2^n arithmetic for implicitly-unsigned types Unknown
1516[expr.call] CD3 Definition of “virtual function call” Unknown
1517[class.cdtor] open Unclear/missing description of behavior during construction/destruction Not resolved
1518[dcl.init.list] CD4 Explicit default constructors and copy-list-initialization Clang 4
1519[temp.variadic] NAD Conflicting default and variadic constructors Unknown
1520[temp.alias] NAD Alias template specialization vs pack expansion Unknown
1521[expr.type.conv] dup T{expr} with reference types Unknown
1522[dcl.init.list] CD3 Access checking for initializer_list array initialization Unknown
1523[stmt.ranged] CD5 Point of declaration in range-based for Unknown
1524[temp.dep.type] drafting Incompletely-defined class template base Not resolved
1525[expr.type.conv] NAD Array bound inference in temporary array Unknown
1526[temp.dep] dup Dependent-class lookup in the current instantiation Unknown
1527[expr.assign] CD3 Assignment from braced-init-list Unknown
1528[dcl.decl] CD3 Repeated cv-qualifiers in declarators Unknown
1529[basic.pre] drafting Nomenclature for variable vs reference non-static data member Not resolved
1530[basic.life] drafting Member access in out-of-lifetime objects Not resolved
1531[intro.defs] CD3 Definition of “access” (verb) Unknown
1532[temp.explicit] CD3 Explicit instantiation and member templates Unknown
1533[temp.variadic] CD3 Function pack expansion for member initialization Unknown
1534[basic.lval] dup cv-qualification of prvalue of type “array of class” Unknown
1535[expr.const] CD3 typeid in core constant expressions Unknown
1536[over.ics.list] drafting Overload resolution with temporary from initializer list Not resolved
1537[expr.const] CD3 Optional compile-time evaluation of constant expressions Unknown
1538[expr.assign] CD3 C-style cast in braced-init-list assignment Unknown
1539[basic.fundamental] CD3 Definition of “character type” Unknown
1540[expr.const] NAD Use of address constants in constant expressions Unknown
1541[stmt.return] CD3 cv void return types Unknown
1542[expr.assign] open Compound assignment of braced-init-list Not resolved
1543[over.ics.list] CD3 Implicit conversion sequence for empty initializer list Unknown
1544[dcl.stc] CD3 Linkage of member of unnamed namespace Unknown
1545[temp.friend] NAD friend function templates defined in class templates Unknown
1546[temp.deduct] NAD Errors in function template default arguments Unknown
1547[temp.res] NAD typename keyword in alias-declarations Unknown
1548[class.copy.ctor] open Copy/move construction and conversion functions Not resolved
1549[over.binary] open Overloaded comma operator with void operand Not resolved
1550[expr.cond] CD3 Parenthesized throw-expression operand of conditional-expression Clang 3.4
1551[namespace.udecl] C++14 Wording problems in using-declaration specification Unknown
1552[dcl.fct.def.default] CD4 exception-specifications and defaulted special member functions Unknown
1553[expr.sizeof] CD3 sizeof and xvalue bit-fields Unknown
1554[temp.alias] drafting Access and alias templates Not resolved
1555[expr.call] NAD Language linkage and function type compatibility Unknown
1556[over.match.copy] CD3 Constructors and explicit conversion functions in direct initialization Unknown
1557[expr.prim.lambda.closure] CD3 Language linkage of converted lambda function pointer Unknown
1558[temp.alias] CD4 Unused arguments in alias template specializations Clang 12
1559[expr.new] CD3 String too long in initializer list of new-expression Unknown
1560[expr.cond] CD3 Gratuitous lvalue-to-rvalue conversion in conditional-expression with throw-expression operand Clang 3.5
1561[dcl.init.aggr] CD4 Aggregates with empty base classes Unknown
1562[class.base.init] C++14 Non-static data member initializers and union ctor-initializer Unknown
1563[over.over] CD3 List-initialization and overloaded function disambiguation Clang 3.1
1564[dcl.spec.auto] NAD Template argument deduction from an initializer list Unknown
1565[dcl.init.list] NAD Copy elision and lifetime of initializer_list underlying array Unknown
1566[expr.new] NAD Should new std::initializer_list<T> be ill-formed? Unknown
1567[class.inhctor] C++14 Inheriting constructors and copy/move constructors Clang 3.3
1568[class.temporary] dup Temporary lifetime extension with intervening cast Unknown
1569[temp.deduct.type] C++14 Deducing a function parameter pack before ellipsis Unknown
1570[temp.arg.nontype] C++14 Address of subobject as non-type template argument Unknown
1571[dcl.init.ref] CD4 cv-qualification for indirect reference binding via conversion function Unknown
1572[dcl.init.ref] CD4 Incorrect example for rvalue reference binding via conversion function Unknown
1573[class.inhctor] CD4 Inherited constructor characteristics Clang 3.9
1574[dcl.fct.def.default] NAD Explicitly-defaulted constexpr functions in wrapper templates Unknown
1575[basic.stc.dynamic.safety] C++14 Incorrect definition of “strict pointer safety” Unknown
1576[expr] C++14 Discarded-value volatile xvalues Unknown
1577[temp.spec.partial.general] NAD Unnecessary restrictions on partial specializations Unknown
1578[dcl.init] NAD Value-initialization of aggregates Unknown
1579[class.copy.ctor] C++14 Return by converting move constructor Clang 3.9
1580[dcl.fct.default] drafting Default arguments in explicit instantiations Not resolved
1581[basic.def.odr] CD5 When are constexpr member functions defined? Unknown
1582[temp.deduct] drafting Template default arguments and deduction failure Not resolved
1583[intro.execution] C++14 Incorrect example of unspecified behavior Unknown
1584[temp.deduct.call] drafting Deducing function types from cv-qualified types Not resolved
1585[expr.ref] NAD Value category of member access of rvalue reference member Unknown
1586[class.dtor] NAD Naming a destructor via decltype Unknown
1587[dcl.constexpr] C++14 constexpr initialization and nested anonymous unions Unknown
1588[dcl.spec.auto] CD3 Deducing cv-qualified auto Unknown
1589[over.ics.rank] CD4 Ambiguous ranking of list-initialization sequences Clang 3.7 (C++11 onwards)
1590[class.copy.ctor] CD4 Bypassing non-copy/move constructor copying Unknown
1591[temp.deduct.call] CD4 Deducing array bound and element type from initializer list Unknown
1592[temp.arg.template] C++14 When do template parameters match? Unknown
1593[class.copy.ctor] C++14 “Parameter type” of special member functions Unknown
1594[class.copy.ctor] drafting Lazy declaration of special members vs overload errors Not resolved
1595[dcl.constexpr] C++14 Constructors “involved in” subobject initialization Unknown
1596[expr.rel] CD4 Non-array objects as array[1] Unknown
1597[dcl.constexpr] CD3 Misleading constexpr example Unknown
1598[expr.eq] C++14 Criterion for equality of pointers to members Unknown
1599[dcl.init.list] CD4 Lifetime of initializer_list underlying array Unknown
1600[dcl.type.simple] CD4 Erroneous reference initialization in example Unknown
1601[conv.prom] C++14 Promotion of enumeration with fixed underlying type Clang 10
1602[temp.inst] review Linkage of specialization vs linkage of template arguments Not resolved
1603[basic.link] CD4 Errors resulting from giving unnamed namespaces internal linkage Unknown
1604[dcl.init.ref] C++14 Double temporaries in reference initialization Unknown
1605[class.dtor] CD3 Misleading parenthetical comment for explicit destructor call Unknown
1606[expr.sizeof] NAD sizeof closure class Clang 3.1
1607[expr.prim.lambda] C++14 Lambdas in template parameters Unknown
1608[over.match.oper] C++14 Operator lookup in trailing return type Unknown
1609[dcl.fct.default] open Default arguments and function parameter packs Not resolved
1610[temp.deduct.partial] drafting Cv-qualification in deduction of reference to array Not resolved
1611[class.ctor] C++14 Deleted default constructor for abstract class Duplicate of 1658
1612[expr.prim.lambda.capture] C++14 Implicit lambda capture and anonymous unions Unknown
1613[expr.prim.lambda.capture] C++14 Constant expressions and lambda capture Unknown
1614[basic.def.odr] CD4 Address of pure virtual function vs odr-use Unknown
1615[dcl.align] CD4 Alignment of types, variables, and members Unknown
1616[stmt.ambig] CD6 Disambiguation parsing and template parameters Unknown
1617[dcl.align] open alignas and non-defining declarations Not resolved
1618[dcl.enum] C++14 Gratuitously-unsigned underlying enum type Unknown
1619[temp.dep.type] open Definition of current instantiation Not resolved
1620[over.literal] open User-defined literals and extended integer types Not resolved
1621[class.base.init] C++20 Member initializers in anonymous unions Unknown
1622[dcl.init.aggr] C++17 Empty aggregate initializer for union Unknown
1623[class.ctor] drafting Deleted default union constructor and member initializers Not resolved
1624[except.ctor] NAD Destruction of union members with member initializers Unknown
1625[cpp.stringize] open Adding spaces between tokens in stringizing Not resolved
1626[expr.const] dup constexpr member functions in brace-or-equal-initializers Unknown
1627[dcl.align] NAD Agreement of dependent alignas specifiers Unknown
1628[expr.new] open Deallocation function templates Not resolved
1629[expr.prim.lambda.closure] C++14 Can a closure class be a literal type? Unknown
1630[dcl.init] CD4 Multiple default constructor templates Unknown
1631[over.ics.list] CD4 Incorrect overload resolution for single-element initializer-list Clang 3.7
1632[expr.prim.lambda.capture] CD5 Lambda capture in member initializers Unknown
1633[dcl.init] CD4 Copy-initialization in member initialization Unknown
1634[basic.stc] open Temporary storage duration Not resolved
1635[temp.param] drafting How similar are template default arguments to function default arguments? Not resolved
1636[dcl.enum] CD5 Bits required for negative enumerator values Unknown
1637[dcl.constexpr] NAD Recursion in constexpr template default constructor Unknown
1638[dcl.enum] CD4 Declaring an explicit specialization of a scoped enumeration Clang 3.1
1639[except.spec] CD4 exception-specifications and pointer/pointer-to-member expressions Unknown
1640[dcl.array] CD5 Array of abstract instance of class template Unknown
1641[class.base.init] NAD Assignment in member initializer Unknown
1642[expr.compound] CD7 Missing requirements for prvalue operands Unknown
1643[temp.param] NAD Default arguments for template parameter packs Unknown
1644[temp.over.link] NAD Equivalent exception-specifications in function template declarations Unknown
1645[class.inhctor] CD4 Identical inheriting constructors via default arguments Clang 3.9
1646[expr.call] CD5 decltype-specifiers, abstract classes, and deduction failure Unknown
1647[temp.spec.partial] drafting Type agreement of non-type template arguments in partial specializations Not resolved
1648[dcl.stc] C++14 thread_local vs block extern declarations Unknown
1649[class.base.init] C++14 Error in the syntax of mem-initializer-list Unknown
1650[dcl.init.ref] NAD Class prvalues in reference initialization Unknown
1651[class.temporary] NAD Lifetime extension of temporary via reference to subobject Unknown
1652[expr.eq] CD4 Object addresses in constexpr expressions Clang 3.6
1653[expr.pre.incr] CD4 Removing deprecated increment of bool Clang 4 (C++17 onwards)
1654[basic.types] dup Literal types and constexpr defaulted constructors Unknown
1655[lex.pptoken] open Line endings in raw string literals Not resolved
1656[lex.ccon] CD6 Encoding of numerically-escaped characters Unknown
1657[namespace.def] CD4 Attributes for namespaces and enumerators Unknown
1658[class.ctor] C++14 Deleted default constructor for abstract class via destructor Clang 5
1659[basic.start.static] open Initialization order of thread_local template static data members Not resolved
1660[class.mem] C++14 member-declaration requirements and unnamed bit-fields Unknown
1661[intro.multithread] NAD Preservation of infinite loops Unknown
1662[expr.prim.lambda.capture] C++14 Capturing function parameter packs Unknown
1663[expr.prim.lambda.capture] NAD Capturing an empty pack expansion Unknown
1664[expr.prim.lambda] C++14 Argument-dependent lookup of lambdas used in default arguments Unknown
1665[temp.explicit] drafting Declaration matching in explicit instantiations Not resolved
1666[temp.arg.nontype] C++14 Address constant expressions Unknown
1667[except.throw] NAD Function exiting via exception called by destructor during unwinding Unknown
1668[dcl.fct] drafting Parameter type determination still not clear enough Not resolved
1669[basic.start.main] C++14 auto return type for main Unknown
1670[dcl.spec.auto] review auto as conversion-type-id Not resolved
1671[temp.deduct.call] NAD Unclear rules for deduction with cv-qualification Unknown
1672[class.mem] CD4 Layout compatibility with multiple empty bases Clang 7
1673[over.best.ics] C++14 Clarifying overload resolution for the second step of copy-initialization Unknown
1674[dcl.spec.auto] C++14 Return type deduction for address of function Unknown
1675[implimits] NAD Size limit for automatic array object Unknown
1676[basic.stc.dynamic.allocation] drafting auto return type for allocation and deallocation functions Not resolved
1677[basic.start.static] C++17 Constant initialization via aggregate initialization Unknown
1678[expr.sizeof] NAD Naming the type of an array of runtime bound Unknown
1679[stmt.ranged] NAD Range-based for and array of runtime bound Unknown
1680[stmt.ranged] drafting Including <initializer_list> for range-based for Not resolved
1681[expr.prim.lambda.capture] C++14 init-captures and nested lambdas Unknown
1682[basic.stc.dynamic.allocation] open Overly-restrictive rules on function templates as allocation functions Not resolved
1683[expr.const] CD4 Incorrect example after constexpr changes Unknown
1684[dcl.constexpr] C++14 Static constexpr member functions for non-literal classes Clang 3.6
1685[expr.unary.noexcept] NAD Value category of noexcept expression Unknown
1686[basic.link] CD4 Which variables are “explicitly declared const?” Unknown
1687[over.match.oper] C++14 Conversions of operands of built-in operators Clang 7
1688[dcl.constexpr] NAD Volatile constexpr variables Unknown
1689[dcl.attr.grammar] C++14 Syntactic nonterminal for operand of alignas Unknown
1690[basic.lookup.argdep] C++14 Associated namespace for local type Clang 9
1691[basic.lookup.argdep] C++14 Argument-dependent lookup and opaque enumerations Clang 9
1692[basic.lookup.argdep] C++14 Associated namespaces of doubly-nested classes Clang 9
1693[class.mem] C++14 Superfluous semicolons in class definitions Unknown
1694[expr.const] CD4 Restriction on reference to temporary as a constant expression Unknown
1695[class.temporary] NAD Lifetime extension via init-capture Unknown
1696[class.temporary] CD4 Temporary lifetime and non-static data member initializers Clang 7
1697[class.temporary] CD4 Lifetime extension and copy elision Unknown
1698[lex.phases] CD7 Files ending in \ Unknown
1699[class.friend] open Does befriending a class befriend its friends? Not resolved
1700[temp.deduct.call] NAD Does the special rvalue-reference deduction apply to alias templates? Unknown
1701[basic.types] open Array vs sequence in object representation Not resolved
1702[class.union] drafting Rephrasing the definition of “anonymous union” Not resolved
1703[dcl.link] NAD Language linkage of names of functions with internal linkage Unknown
1704[temp.explicit] CD5 Type checking in explicit instantiation of variable templates Unknown
1705[temp.deduct.partial] CD4 Unclear specification of “more specialized” Unknown
1706[dcl.attr.grammar] drafting alignas pack expansion syntax Not resolved
1707[dcl.type.elab] C++14 template in elaborated-type-specifier without nested-name-specifier Unknown
1708[dcl.link] CD4 overly-strict requirements for names with C language linkage Unknown
1709[cpp.stringize] open Stringizing raw string literals containing newline Not resolved
1710[class.derived] C++17 Missing template keyword in class-or-decltype No
1711[temp.spec.partial] CD6 Missing specification of variable template partial specializations Unknown
1712[dcl.constexpr] CD4 constexpr variable template declarations Unknown
1713[dcl.link] dup Linkage of variable template specializations Unknown
1714[class.local] NAD odr-use of this from a local class Unknown
1715[class.inhctor] CD4 Access and inherited constructor templates Clang 3.9
1716[dcl.fct.default] C++14 When are default arguments evaluated? Unknown
1717[lex.icon] C++14 Missing specification of type of binary literal Unknown
1718[cpp.replace] open Macro invocation spanning end-of-file Not resolved
1719[class.mem] CD4 Layout compatibility and cv-qualification revisited Clang 19
1720[cpp.include] NAD Macro invocation in #include directive Unknown
1721[class.static.data] review Diagnosing ODR violations for static data members Not resolved
1722[expr.prim.lambda.closure] CD4 Should lambda to function pointer conversion function be noexcept? Clang 9
1723[lex.ext] open Multicharacter user-defined character literals Not resolved
1724[temp.deduct] CD6 Unclear rules for deduction failure Unknown
1725[dcl.spec.auto] NAD Trailing return type with nested function declarator Unknown
1726[class.conv.fct] CD6 Declarator operators and conversion function Unknown
1727[temp.expl.spec] NAD Type of a specialization of a variable template Unknown
1728[temp.explicit] CD5 Type of an explicit instantiation of a variable template Unknown
1729[temp.decls] CD6 Matching declarations and definitions of variable templates Unknown
1730[temp.decls] drafting Can a variable template have an unnamed type? Not resolved
1731[class.copy.ctor] NAD is_trivially_X and definitions of special member functions Unknown
1732[stmt.select] C++14 Defining types in conditions and range-based for statements Unknown
1733[dcl.fct.def.default] CD6 Return type and value for operator= with ref-qualifier Unknown
1734[class.copy.ctor] CD4 Nontrivial deleted copy functions No
1735[lex.ext] open Out-of-range literals in user-defined-literals Not resolved
1736[class.inhctor] CD4 Inheriting constructor templates in a local class Clang 3.9
1737[temp.dep.type] C++14 Type dependence of call to a member of the current instantiation Unknown
1738[class.inhctor] C++14 Explicit instantiation/specialization of inheriting constructor templates Superseded by P0136R1
1739[expr.static.cast] C++14 Conversion of floating point to enumeration Unknown
1740[except.spec] C++14 Disambiguation of noexcept Unknown
1741[basic.def.odr] C++14 odr-use of class object in lvalue-to-rvalue conversion Unknown
1742[namespace.udecl] CD5 using-declarations and scoped enumerators Unknown
1743[expr.prim.lambda.capture] NAD init-captures in nested lambdas Unknown
1744[basic.start.static] CD4 Unordered initialization for variable template specializations Unknown
1745[dcl.constexpr] NAD thread_local constexpr variable Unknown
1746[basic.types] C++14 Are volatile scalar types trivially copyable? Unknown
1747[basic.start.static] C++14 Constant initialization of reference to function Unknown
1748[expr.new] CD4 Placement new with a null pointer Clang 3.7
1749[basic.start.static] NAD Confusing definition for constant initializer Unknown
1750[over.match.copy] CD4 “Argument” vs “parameter” Unknown
1751[basic.life] CD4 Non-trivial operations vs non-trivial initialization Unknown
1752[class.base.init] CD4 Right-recursion in mem-initializer-list Unknown
1753[basic.lookup.qual] CD4 decltype-specifier in nested-name-specifier of destructor Clang 11
1754[temp.spec.partial] NAD Declaration of partial specialization of static data member template Unknown
1755[temp.spec.partial.member] drafting Out-of-class partial specializations of member templates Not resolved
1756[dcl.init.list] CD4 Direct-list-initialization of a non-class object Clang 3.7
1757[expr.const] CD4 Const integral subobjects Unknown
1758[over.match.list] CD4 Explicit conversion in copy/move list initialization Clang 3.7
1759[lex.string] C++14 UTF-8 code units in plain char Unknown
1760[expr.prim.lambda.capture] C++14 Access of member corresponding to init-capture Unknown
1761[dcl.array] NAD Runtime check on size of automatic array Unknown
1762[over.literal] C++14 Reserved identifier used in literal-operator-id example Clang 14
1763[temp.deduct.type] open Length mismatch in template type deduction Not resolved
1764[class.member.lookup] C++14 Hiding of function from using-declaration by signature Unknown
1765[dcl.enum] C++14 Overflow of enumeration used as enumerator value Unknown
1766[dcl.enum] CD4 Values outside the range of the values of an enumeration Unknown
1767[stmt.switch] C++14 Scoped enumeration in a switch statement Unknown
1768[dcl.array] NAD Zero-element array of runtime bound Unknown
1769[except.handle] C++14 Catching a base class of the exception object Unknown
1770[temp.deduct.type] C++14 Type matching of non-type template parameters and arguments Unknown
1771[basic.lookup.qual] CD6 Restricted lookup in nested-name-specifier Unknown
1772[expr.prim.lambda] C++14 __func__ in a lambda body Clang 14
1773[conv.lval] C++14 Out-of-lifetime lvalue-to-rvalue conversion Unknown
1774[except.ctor] CD4 Discrepancy between subobject destruction and stack unwinding Unknown
1775[lex.phases] C++14 Undefined behavior of line splice in raw string literal Unknown
1776[basic.life] CD4 Replacement of class objects containing reference members Unknown
1777[except.spec] CD4 Empty pack expansion in dynamic-exception-specification Unknown
1778[dcl.fct.def.default] C++14 exception-specification in explicitly-defaulted functions Clang 9
1779[temp.dep.expr] CD4 Type dependency of __func__ Clang 14
1780[expr.prim.lambda.closure] CD4 Explicit instantiation/specialization of generic lambda operator() Unknown
1781[over.match.conv] CD5 Converting from nullptr_t to bool in overload resolution Unknown
1782[dcl.init] CD4 Form of initialization for nullptr_t to bool conversion Unknown
1783[class.dtor] NAD Why are virtual destructors non-trivial? Unknown
1784[stmt.dcl] C++17 Concurrent execution during static local initialization Unknown
1785[temp.res] NAD Conflicting diagnostic requirements for template definitions Unknown
1786[expr.new] C++14 Effect of merging allocations on memory leakage Unknown
1787[conv.lval] C++14 Uninitialized unsigned char values Unknown
1788[expr.delete] CD4 Sized deallocation of array of non-class type Unknown
1789[over.ics.rank] open Array reference vs array decay in overload resolution Not resolved
1790[dcl.fct] open Ellipsis following function parameter pack Not resolved
1791[dcl.fct.def.general] CD4 Incorrect restrictions on cv-qualifier-seq and ref-qualifier Unknown
1792[temp.expl.spec] NAD Incorrect example of explicit specialization of member enumeration Unknown
1793[dcl.stc] CD4 thread_local in explicit specializations Unknown
1794[temp.names] C++17 template keyword and alias templates Clang 2.7
1795[namespace.def] CD4 Disambiguating original-namespace-definition and extension-namespace-definition Unknown
1796[lex.charset] CD4 Is all-bits-zero for null characters a meaningful requirement? Unknown
1797[basic.fundamental] CD4 Are all bit patterns of unsigned char distinct numbers? Unknown
1798[except.spec] NAD exception-specifications of template arguments Unknown
1799[dcl.stc] CD4 mutable and non-explicit const qualification Unknown
1800[expr.unary.op] CD4 Pointer to member of nested anonymous union Clang 2.9
1801[class.union] CD4 Kind of expression referring to member of anonymous union Clang 2.8
1802[lex.string] CD4 char16_t string literals and surrogate pairs Clang 3.1
1803[class.mem] CD5 opaque-enum-declaration as member-declaration Clang 2.9
1804[temp.friend] CD4 Partial specialization and friendship Clang 2.7
1805[expr.cond] CD4 Conversions of array operands in conditional-expressions Unknown
1806[class.copy.assign] CD4 Virtual bases and move-assignment Unknown
1807[except.ctor] CD4 Order of destruction of array elements after an exception Clang 3.0
1808[class.ctor] drafting Constructor templates vs default constructors Not resolved
1809[temp.deduct] CD4 Narrowing and template argument deduction Unknown
1810[lex.ext] CD4 Invalid ud-suffixes Unknown
1811[class.dtor] CD4 Lookup of deallocation function in a virtual destructor definition Unknown
1812[temp.names] C++17 Omission of template in a typename-specifier No
1813[class] CD4 Direct vs indirect bases in standard-layout classes Clang 7
1814[dcl.fct.default] CD4 Default arguments in lambda-expressions Clang 3.1
1815[dcl.init.aggr] CD4 Lifetime extension in aggregate initialization Clang 20
1816[conv.integral] CD4 Unclear specification of bit-field values Unknown
1817[dcl.link] open Linkage specifications and nested scopes Not resolved
1818[dcl.link] CD6 Visibility and inherited language linkage Clang 3.4
1819[temp.spec.partial.general] CD4 Acceptable scopes for definition of partial specialization Unknown
1820[dcl.typedef] CD6 Qualified typedef names Clang 3.5
1821[class.mem] CD6 Qualified redeclarations in a class member-specification Clang 2.9
1822[expr.prim.lambda] CD6 Lookup of parameter names in lambda-expressions Clang 3.1
1823[dcl.fct.spec] CD4 String literal uniqueness in inline functions Unknown
1824[dcl.fct] CD4 Completeness of return type vs point of instantiation Clang 2.7
1825[temp.deduct.partial] C++17 Partial ordering between variadic and non-variadic function templates Unknown
1826[expr.const] NAD const floating-point in constant expressions Unknown
1827[dcl.init.ref] drafting Reference binding with ambiguous conversions Not resolved
1828[basic.lookup.qual] CD6 nested-name-specifier ambiguity Unknown
1829[temp.dep.type] CD6 Dependent unnamed types Unknown
1830[dcl.pre] CD4 Repeated specifiers Unknown
1831[class.copy.ctor] NAD Explicitly vs implicitly deleted move constructors Unknown
1832[expr.static.cast] CD4 Casting to incomplete enumeration Clang 3.0
1833[class.friend] NAD friend declarations naming implicitly-declared member functions Unknown
1834[basic.start.static] CD4 Constant initialization binding a reference to an xvalue Unknown
1835[basic.lookup.classref] CD6 Dependent member lookup before < Unknown
1836[expr.prim.general] CD5 Use of class type being defined in trailing-return-type Unknown
1837[expr.prim.general] CD6 Use of this in friend and local class declarations Clang 3.3
1838[namespace.memdef] CD4 Definition via unqualified-id and using-declaration Unknown
1839[basic.link] CD6 Lookup of block-scope extern declarations Unknown
1840[temp.expl.spec] drafting Non-deleted explicit specialization of deleted function template Not resolved
1841[temp.local] CD6 < following template injected-class-name Unknown
1842[intro.multithread] open Unevaluated operands and “carries a dependency” Not resolved
1843[expr.cond] CD4 Bit-field in conditional operator with throw operand Unknown
1844[temp.deduct] open Defining “immediate context” Not resolved
1845[temp.point] review Point of instantiation of a variable template specialization Not resolved
1846[dcl.fct.def.default] CD4 Declaring explicitly-defaulted implicitly-deleted functions Unknown
1847[temp.deduct.type] CD4 Clarifying compatibility during partial ordering Unknown
1848[class.dtor] CD4 Parenthesized constructor and destructor declarators Unknown
1849[basic.def.odr] CD6 Variable templates and the ODR Unknown
1850[temp.res] CD4 Differences between definition context and point of instantiation Unknown
1851[expr.new] CD4 decltype(auto) in new-expressions Unknown
1852[dcl.type.simple] CD4 Wording issues regarding decltype(auto) Unknown
1853[basic.life] dup Defining “allocated storage” Unknown
1854[dcl.fct.def.default] drafting Disallowing use of implicitly-deleted functions Not resolved
1855[class.cdtor] dup Out-of-lifetime access to nonstatic data members Unknown
1856[temp.inst] open Indirect nested classes of class templates Not resolved
1857[expr.shift] CD5 Additional questions about bits Unknown
1858[expr.eq] CD4 Comparing pointers to union members Unknown
1859[lex.string] CD5 UTF-16 in char16_t string literals Unknown
1860[class.union] C++17 What is a “direct member?” Unknown
1861[class.bit] CD4 Values of a bit-field Unknown
1862[temp.friend] CD5 Determining “corresponding members” for friendship No
1863[except.throw] CD4 Requirements on thrown object type to support std::current_exception() Unknown
1864[dcl.init.list] NAD List-initialization of array objects Unknown
1865[expr.add] CD4 Pointer arithmetic and multi-level qualification conversions Unknown
1866[except.ctor] CD4 Initializing variant members with non-trivial destructors Unknown
1867[dcl.ambig.res] NAD Function/expression ambiguity with qualified parameter name Unknown
1868[dcl.spec.auto] open Meaning of “placeholder type” Not resolved
1869[dcl.link] NAD thread_local vs linkage-specifications Unknown
1870[basic.def] CD4 Contradictory wording about definitions vs explicit specialization/instantiation Unknown
1871[lex.ext] NAD Non-identifier characters in ud-suffix Unknown
1872[dcl.constexpr] CD4 Instantiations of constexpr templates that cannot appear in constant expressions Clang 9
1873[class.access.base] CD4 Protected member access from derived class friends Unknown
1874[temp.param] CD4 Type vs non-type template parameters with class keyword Unknown
1875[basic.scope.class] CD4 Reordering declarations in class scope Unknown
1876[temp.expl.spec] NAD Preventing explicit specialization Unknown
1877[dcl.spec.auto] CD4 Return type deduction from return with no operand Unknown
1878[dcl.spec.auto] CD4 operator auto template Clang 18
1879[basic.align] NAD Inadequate definition of alignment requirement Unknown
1880[expr.call] CD4 When are parameter objects destroyed? Unknown
1881[class] CD4 Standard-layout classes and unnamed bit-fields Clang 7
1882[global.names] CD4 Reserved names without library use Unknown
1883[class.protected] review Protected access to constructors in mem-initializers Not resolved
1884[basic.link] CD6 Unclear requirements for same-named external-linkage entities Partial
1885[expr.call] CD4 Return value of a function is underspecified Unknown
1886[basic.start.main] CD4 Language linkage for main() Unknown
1887[namespace.udecl] CD4 Problems with :: as nested-name-specifier Unknown
1888[class.ctor] CD4 Implicitly-declared default constructors and explicit Unknown
1889[cpp.pragma] open Unclear effect of #pragma on conformance Not resolved
1890[class.mem] drafting Member type depending on definition of member function Not resolved
1891[expr.prim.lambda.closure] CD4 Move constructor/assignment for closure class Clang 4
1892[dcl.spec.auto] CD4 Use of auto in function type Unknown
1893[expr.type.conv] CD5 Function-style cast with braced-init-lists and empty pack expansions Unknown
1894[dcl.typedef] CD6 typedef-names and using-declarations Clang 3.8
1895[expr.cond] CD4 Deleted conversions in conditional operator operands Unknown
1896[temp.alias] CD6 Repeated alias templates Unknown
1897[basic.def.odr] review ODR vs alternative tokens Not resolved
1898[over.dcl] CD6 Use of “equivalent” in overload resolution Clang 2.7
1899[temp.dep.constexpr] CD4 Value-dependent constant expressions Unknown
1900[dcl.meaning] CD6 Do friend declarations count as “previous declarations”? Clang 2.7
1901[lex.token] open punctuator referenced but not defined Not resolved
1902[over.best.ics] CD4 What makes a conversion “otherwise ill-formed”? Clang 3.7
1903[namespace.udecl] CD4 What declarations are introduced by a non-member using-declaration? Clang 2.7
1904[temp.param] NAD Default template arguments for members of class templates Unknown
1905[temp.dep.type] NAD Dependent types and injected-class-names Unknown
1906[basic.lookup.unqual] NAD Name lookup in member friend declaration Unknown
1907[namespace.udecl] CD6 using-declarations and default arguments Unknown
1908[basic.lookup.classref] CD6 Dual destructor lookup and template-ids Unknown
1909[class.mem] CD4 Member class template with the same name as the class Clang 3.7
1910[basic.stc.dynamic.allocation] CD5 “Shall” requirement applied to runtime behavior Unknown
1911[dcl.constexpr] CD4 constexpr constructor with non-literal base class Unknown
1912[dcl.fct.def.default] CD5 exception-specification of defaulted function Unknown
1913[expr.prim.lambda] CD5 decltype((x)) in lambda-expressions Unknown
1914[dcl.attr] extension Duplicate standard attributes Extension
1915[class.base.init] open Potentially-invoked destructors in non-throwing constructors Not resolved
1916[class.copy.ctor] CD4 “Same cv-unqualified type” Unknown
1917[dcl.enum] NAD decltype-qualified enumeration names Unknown
1918[temp.friend] CD5 friend templates with dependent scopes No
1919[over.match.oper] open Overload resolution for ! with explicit conversion operator Not resolved
1920[expr.pseudo] CD4 Qualification mismatch in pseudo-destructor-name Unknown
1921[expr.const] NAD constexpr constructors and point of initialization of const variables Unknown
1922[temp.local] CD4 Injected class template names and default arguments Unknown
1923[expr.unary.op] NAD Lvalues of type void Unknown
1924[lex.literal] review Definition of “literal” and kinds of literals Not resolved
1925[expr.comma] CD4 Bit-field prvalues Unknown
1926[basic.def.odr] CD4 Potential results of subscript operator Unknown
1927[expr.prim.lambda.capture] dup Lifetime of temporaries in init-captures Unknown
1928[class.copy.ctor] NAD Triviality of deleted special member functions Unknown
1929[expr.prim.general] CD4 template keyword following namespace nested-name-specifier Unknown
1930[dcl.stc] CD4 init-declarator-list vs member-declarator-list Unknown
1931[expr.prim.lambda.closure] CD5 Default-constructible and copy-assignable closure types Unknown
1932[expr.cond] CD4 Bit-field results of conditional operators Unknown
1933[implimits] NAD Implementation limit for initializer-list elements Unknown
1934[except.spec] NAD Relaxing exception-specification compatibility requirements Unknown
1935[expr.new] CD5 Reuse of placement arguments in deallocation Unknown
1936[temp.dep] CD6 Dependent qualified-ids Unknown
1937[expr.prim.lambda.closure] CD5 Incomplete specification of function pointer from lambda Unknown
1938[intro.compliance] CD5 Should hosted/freestanding be implementation-defined? Unknown
1939[temp.deduct.call] open Argument conversions to nondeduced parameter types revisited Not resolved
1940[class.union] CD4 static_assert in anonymous unions Clang 3.5
1941[class.inhctor] CD4 SFINAE and inherited constructor default arguments Clang 3.9
1942[expr.prim.lambda] CD4 Incorrect reference to trailing-return-type Unknown
1943[class.bit] CD5 Unspecified meaning of “bit” Unknown
1944[diff] open New C incompatibilities Not resolved
1945[temp.friend] CD5 Friend declarations naming members of class templates in non-templates No
1946[except.spec] CD4 exception-specifications vs pointer dereference Unknown
1947[lex.icon] NAD Digit separators following non-octal prefix Clang 3.5
1948[basic.stc.dynamic] NAD exception-specification of replacement global new Clang 3.5
1949[intro.execution] CD4 “sequenced after” instead of “sequenced before” Unknown
1950[over.ics.rank] NAD Restructuring description of ranks of conversion sequences Unknown
1951[basic.types] CD4 Cv-qualification and literal types Unknown
1952[expr.const] CD4 Constant expressions and library undefined behavior Unknown
1953[intro.memory] CD7 Data races and common initial sequence Unknown
1954[expr.typeid] CD7 typeid null dereference check in subexpressions Unknown
1955[cpp.cond] CD4 #elif with invalid controlling expression Unknown
1956[basic.stc.auto] CD4 Reuse of storage of automatic variables Unknown
1957[dcl.spec.auto] NAD decltype(auto) with direct-list-initialization Unknown
1958[dcl.spec.auto] CD4 decltype(auto) with parenthesized initializer Unknown
1959[class.inhctor] CD4 Inadvertently inherited copy constructor Clang 3.9
1960[namespace.udecl] NAD Visibility of entity named in class-scope using-declaration No
1961[intro.multithread] C++17 Potentially-concurrent actions within a signal handler Unknown
1962[dcl.fct.def.general] open Type of __func__ Not resolved
1963[lex.name] CD4 Implementation-defined identifier characters Unknown
1964[dcl.typedef] NAD opaque-enum-declaration in alias-declaration? Unknown
1965[expr.dynamic.cast] CD7 Explicit casts to reference types Unknown
1966[dcl.enum] CD4 Colon following enumeration elaborated-type-specifier Clang 11
1967[class.copy.elision] CD4 Temporary lifetime and move-elision Unknown
1968[expr.const] NAD Address of typeid in constant expressions No
1969[class.dtor] CD6 Missing exclusion of ~S as an ordinary function name Unknown
1970[dcl.ambig.res] NAD Ambiguity resolution for (T())*x Unknown
1971[expr.unary.op] CD4 Unclear disambiguation of destructor and operator~ Unknown
1972[lex.name] CD6 Identifier character restrictions in non-identifiers Unknown
1973[expr.prim.lambda.closure] CD7 Which parameter-declaration-clause in a lambda-expression? Unknown
1974[temp.res] NAD Redundant specification of non-type typename-specifier Unknown
1975[except.spec] CD4 Permissible declarations for exception-specifications Unknown
1976[namespace.alias] NAD Ambiguity of namespace-aliases Unknown
1977[class.dtor] open Contradictory results of failed destructor lookup Not resolved
1978[class.conv.ctor] CD4 Redundant description of explicit constructor use Unknown
1979[temp.alias] drafting Alias template specialization in template member definition Not resolved
1980[temp.alias] drafting Equivalent but not functionally-equivalent redeclarations Not resolved
1981[conv] CD4 Implicit contextual conversions and explicit Unknown
1982[temp.arg.explicit] NAD Deduction extending parameter pack Unknown
1983[class.mem] CD5 Inappropriate use of virt-specifier Unknown
1984[dcl.init.list] NAD Lossless narrowing conversions Unknown
1985[dcl.init.aggr] NAD Unknown bound array member with brace-or-equal-initializer Unknown
1986[basic.start.static] drafting odr-use and delayed initialization Not resolved
1987[class.static.data] NAD constexpr static data members across translation units Unknown
1988[temp.dep.type] CD4 Ambiguity between dependent and non-dependent bases in implicit member access Unknown
1989[over.oper] drafting Insufficient restrictions on parameters of postfix operators Not resolved
1990[dcl.pre] CD4 Ambiguity due to optional decl-specifier-seq Unknown
1991[class.inhctor] CD4 Inheriting constructors vs default arguments Clang 3.9
1992[expr.new] CD4 new (std::nothrow) int[N] can throw Unknown
1993[temp.expl.spec] open Use of template<> defining member of explicit specialization Not resolved
1994[temp.expl.spec] dup Confusing wording regarding multiple template<> prefixes Duplicate of 529
1995[except.spec] CD4 exception-specifications and non-type template parameters Unknown
1996[dcl.init.list] drafting Reference list-initialization ignores conversion functions Not resolved
1997[basic.indet] CD7 Placement new and previous initialization Unknown
1998[basic.lval] NAD Additional sources of xvalue expressions Unknown
1999[lex.phases] CD4 Representation of source characters as universal-character-names Unknown
2000[lex.pptoken] CD4 header-name outside #include directive Unknown
2001[cpp.pre] CD4 non-directive is underspecified Unknown
2002[cpp.pre] open White space within preprocessing directives Not resolved
2003[cpp.replace] drafting Zero-argument macros incorrectly specified Not resolved
2004[expr.const] CD4 Unions with mutable members in constant expressions Unknown
2005[expr.const] NAD Incorrect constexpr reference initialization requirements Unknown
2006[basic.compound] CD4 Cv-qualified void types Unknown
2007[over.match.oper] CD6 Argument-dependent lookup for operator= Clang 3.4
2008[temp.arg] CD4 Default template-arguments underspecified Unknown
2009[basic.scope.class] CD6 Unclear specification of class scope N/A
2010[except.spec] CD4 exception-specifications and conversion operators Unknown
2011[expr.prim.lambda.capture] C++17 Unclear effect of reference capture of reference Unknown
2012[basic.stc] CD4 Lifetime of references Unknown
2013[expr.add] drafting Pointer subtraction in large array Not resolved
2014[new.delete.array] NAD Unneeded deallocation signatures Unknown
2015[dcl.fct.def.delete] CD4 odr-use of deleted virtual functions Unknown
2016[class.conv.fct] CD4 Confusing wording in description of conversion function Unknown
2017[stmt.return] CD4 Flowing off end is not equivalent to no-expression return Unknown
2018[dcl.init.ref] dup Qualification conversion vs reference binding Unknown
2019[basic.stc.general] CD4 Member references omitted from description of storage duration Unknown
2020[basic.def.odr] CD5 Inadequate description of odr-use of implicitly-invoked functions Unknown
2021[temp.over.link] dup Function template redeclaration via alias template Unknown
2022[expr.const] CD4 Copy elision in constant expressions Unknown
2023[expr.cond] drafting Composite reference result type of conditional operator Not resolved
2024[temp.dep.type] CD4 Dependent types and unexpanded parameter packs Unknown
2025[temp.over.link] dup Declaration matching via alias templates Unknown
2026[basic.start] CD4 Zero-initialization and constexpr Clang 11
2027[dcl.align] CD4 Unclear requirements for multiple alignas specifiers Unknown
2028[over.match.ref] drafting Converting constructors in rvalue reference initialization Not resolved
2029[expr.call] dup Abstract class return type in decltype operand Unknown
2030[class.access.base] NAD Access of injected-class-name with template arguments Unknown
2031[diff.cpp03.expr] CD4 Missing incompatibility for && Unknown
2032[temp.param] CD4 Default template-arguments of variable templates Unknown
2033[temp.spec.partial.general] CD4 Redundant restriction on partial specialization argument Unknown
2034[except.uncaught] NAD Deprecating uncaught_exception() Unknown
2035[temp.spec.partial.match] CD3 Multi-section example is confusing Unknown
2036[dcl.decl] NAD Refactoring parameters-and-qualifiers Unknown
2037[temp.type] drafting Alias templates and template declaration matching Not resolved
2038[diff.cpp14] CD4 Document C++14 incompatibility of new braced deduction rule Unknown
2039[except.spec] CD4 Constant conversions to bool Unknown
2040[dcl.decl] CD4 trailing-return-type no longer ambiguous Unknown
2041[temp.expl.spec] CD4 Namespace for explicit class template specialization Unknown
2042[basic.stc.dynamic.deallocation] review Exceptions and deallocation functions Not resolved
2043[temp.arg.nontype] drafting Generalized template arguments and array-to-pointer decay Not resolved
2044[dcl.spec.auto] CD4 decltype(auto) and void Unknown
2045[temp.over.link] CD5 “Identical” template parameter lists Unknown
2046[intro.multithread] C++17 Incomplete thread specifications Unknown
2047[except.spec] CD4 Coordinating “throws anything” specifications Unknown
2048[expr.static.cast] open C-style casts that cast away constness vs static_cast Not resolved
2049[temp.arg.nontype] CD7 List initializer in non-type template default argument Clang 18
2050[dcl.stc] NAD Consolidate specification of linkage Unknown
2051[basic.lval] CD5 Simplifying alias rules Unknown
2052[over.oper] CD4 Template argument deduction vs overloaded operators Unknown
2053[dcl.spec.auto] C++20 auto in non-generic lambdas Unknown
2054[temp.deduct] CD7 Missing description of class SFINAE Unknown
2055[temp.arg.explicit] drafting Explicitly-specified non-deduced parameter packs Not resolved
2056[class.base.init] open Member function calls in partially-initialized class objects Not resolved
2057[temp.arg.template] drafting Template template arguments with default arguments Not resolved
2058[basic.link] CD6 More errors from internal-linkage namespaces Unknown
2059[dcl.spec.auto] CD5 Linkage and deduced return types Unknown
2060[dcl.spec.auto] NAD Deduced return type for explicit specialization Unknown
2061[namespace.def] CD4 Inline namespace after simplifications Clang 2.7
2062[temp.class] CD6 Class template redeclaration requirements Unknown
2063[basic.scope.declarative] CD4 Type/nontype hiding in class scope Unknown
2064[temp.type] CD4 Conflicting specifications for dependent decltype-specifiers Unknown
2065[temp.dep.type] CD6 Current instantiation of a partial specialization Unknown
2066[temp.dep.constexpr] CD4 Does type-dependent imply value-dependent? Unknown
2067[temp.res] open Generated variadic templates requiring empty pack Not resolved
2068[class.dtor] CD4 When can/must a defaulted virtual destructor be defined? Unknown
2069[class.dtor] CD4 Do destructors have names? Unknown
2070[class.qual] CD6 using-declaration with dependent nested-name-specifier Unknown
2071[dcl.typedef] CD4 typedef with no declarator Unknown
2072[temp.inst] C++23 Default argument instantiation for member functions of templates Unknown
2073[basic.stc.dynamic.allocation] open Allocating memory for exception objects Not resolved
2074[temp.dep.type] drafting Type-dependence of local class of function template Not resolved
2075[over.ics.list] CD4 Passing short initializer lists to array reference parameters Unknown
2076[over.best.ics] CD4 List-initialization of arguments for constructor parameters Clang 13
2077[over.ics.ref] drafting Overload resolution and invalid rvalue-reference initialization Not resolved
2078[class.member.lookup] NAD Name lookup of mem-initializer-id Unknown
2079[dcl.attr.grammar] CD4 [[ appearing in a balanced-token-seq Unknown
2080[class.union] CD5 Example with empty anonymous union member Unknown
2081[dcl.spec.auto] CD5 Deduced return type in redeclaration or specialization of function template Unknown
2082[dcl.fct.default] CD4 Referring to parameters in unevaluated operands of default arguments Clang 11
2083[basic.def.odr] CD5 Incorrect cases of odr-use Partial
2084[class.ctor] CD4 NSDMIs and deleted union default constructors Clang 3.1
2085[basic.def.odr] CD4 Invalid example of adding special member function via default argument Unknown
2086[expr.prim.lambda.capture] drafting Reference odr-use vs implicit capture Not resolved
2087[expr.shift] NAD Left shift of negative value by zero bits Unknown
2088[temp.deduct.partial] CD5 Late tiebreakers in partial ordering Unknown
2089[over.match.oper] drafting Restricting selection of builtin overloaded operators Not resolved
2090[temp.dep.temp] open Dependency via non-dependent base class Not resolved
2091[temp.deduct.type] CD4 Deducing reference non-type template arguments Clang 10
2092[temp.over] CD5 Deduction failure and overload resolution Unknown
2093[except.handle] CD4 Qualification conversion for pointer-to-member handler matching Unknown
2094[class.copy.ctor] C++17 Trivial copy/move constructor for class with volatile member Clang 5
2095[expr.prim.lambda.capture] CD4 Capturing rvalue references to functions by copy Unknown
2096[basic.types] CD4 Constraints on literal unions Duplicate of 2598
2097[expr.prim.lambda] extension Lambdas and noreturn attribute Extension
2098[except.uncaught] CD4 Is uncaught_exceptions() per-thread? Unknown
2099[dcl.array] CD4 Inferring the bound of an array static data member Unknown
2100[temp.dep.constexpr] C++17 Value-dependent address of static data member of class template Clang 12
2101[temp.dep] CD4 Incorrect description of type- and value-dependence Unknown
2102[expr.new] CD7 Constructor checking in new-expression Unknown
2103[basic.def.odr] CD5 Lvalue-to-rvalue conversion is irrelevant in odr-use of a reference Clang 2.7
2104[basic.def.odr] CD4 Internal-linkage constexpr references and ODR requirements Unknown
2105[temp.arg] open When do the arguments for a parameter pack end? Not resolved
2106[temp.arg.type] CD4 Unclear restrictions on use of function-type template arguments Unknown
2107[class.temporary] CD4 Lifetime of temporaries for default arguments in array copying Unknown
2108[over.match.ref] drafting Conversions to non-class prvalues in reference initialization Not resolved
2109[temp.dep.constexpr] CD4 Value dependence underspecified Unknown
2110[over.ics.rank] drafting Overload resolution for base class conversion and reference/non-reference Not resolved
2111[dcl.init.ref] NAD Array temporaries in reference binding Unknown
2112[expr.new] CD5 new auto{x} Unknown
2113[dcl.meaning] CD4 Incomplete specification of types for declarators Unknown
2114[diff.cpp11.dcl.decl] CD3 Missing description of incompatibility from aggregate NSDMIs Unknown
2115[stmt.jump] open Order of implicit destruction vs release of automatic storage Not resolved
2116[dcl.init.aggr] C++17 Direct or copy initialization for omitted aggregate initializers Unknown
2117[dcl.constexpr] NAD Explicit specializations and constexpr function templates Unknown
2118[temp.friend] open Stateful metaprogramming via friend injection Not resolved
2119[class.virtual] NAD Disambiguation of multi-level covariant return type Unknown
2120[class] CD4 Array as first non-static data member in standard-layout class Clang 7
2121[expr.prim.lambda.general] CD6 More flexible lambda syntax Unknown
2122[basic.lval] CD4 Glvalues of void type Unknown
2123[stmt.dcl] open Omitted constant initialization of local static variables Not resolved
2124[defns.signature.member.templ] CD4 Signature of constructor template Unknown
2125[class.copy.elision] NAD Copy elision and comma operator Unknown
2126[expr.const] C++20 Lifetime-extended temporaries in constant expressions Clang 12
2127[temp.spec.partial] drafting Partial specialization and nullptr Not resolved
2128[dcl.init.aggr] open Imprecise rule for reference member initializer Not resolved
2129[expr.const] CD4 Non-object prvalues and constant expressions Unknown
2130[expr.new] CD4 Over-aligned types in new-expressions Unknown
2131[dcl.enum] drafting Ambiguity with opaque-enum-declaration Not resolved
2132[class.copy.ctor] NAD Deprecated default generated copy constructors Unknown
2133[conv.fctptr] CD5 Converting std::nullptr_t to bool Unknown
2134[expr.prim.general] NAD Objectless references to non-static member functions Unknown
2135[class.base.init] NAD mem-initializers for virtual bases of abstract classes Unknown
2136[basic.lookup.argdep] NAD Argument-dependent lookup and initializer lists Unknown
2137[dcl.init.list] CD4 List-initialization from object of same type Clang 20
2138[temp.expl.spec] NAD Explicit member specialization vs implicit instantiation Unknown
2139[conv.fpint] NAD Floating-point requirements for integer representation Unknown
2140[conv.lval] CD4 Lvalue-to-rvalue conversion of std::nullptr_t Clang 9
2141[expr.new] CD4 Ambiguity in new-expression with elaborated-type-specifier Clang 17
2142[basic.lookup.argdep] NAD Missing definition of associated classes and namespaces Unknown
2143[temp.dep.type] C++17 Value-dependency via injected-class-name Unknown
2144[dcl.fct.def.general] CD7 Function/variable declaration ambiguity Unknown
2145[dcl.fct.def.general] CD4 Parenthesized declarator in function definition Unknown
2146[intro.execution] CD4 Scalar object vs memory location in definition of “unsequenced” Unknown
2147[temp.deduct.call] CD4 Initializer-list arguments and pack deduction Unknown
2148[basic.start.static] drafting Thread storage duration and order of initialization Not resolved
2149[dcl.init.aggr] CD7 Brace elision and array length deduction Clang 3.1
2150[dcl.init.list] CD3 Initializer list array lifetime Unknown
2151[intro.object] CD4 Exception object is not created Unknown
2152[lex.ext] NAD Can an alternative token be used as a ud-suffix? Unknown
2153[class.mem] CD4 pure-specifier in friend declaration Unknown
2154[class.mem] CD4 Ambiguity of pure-specifier Unknown
2155[namespace.memdef] C++17 Defining classes and enumerations via using-declarations Unknown
2156[dcl.enum] CD4 Definition of enumeration declared by using-declaration Unknown
2157[dcl.type.elab] CD4 Further disambiguation of enumeration elaborated-type-specifier Clang 11
2158[class.dtor] drafting Polymorphic behavior during destruction Not resolved
2159[expr.prim.lambda.capture] NAD Lambda capture and local thread_local variables Unknown
2160[temp.func.order] open Issues with partial ordering Not resolved
2161[temp.explicit] NAD Explicit instantiation declaration and “preceding initialization” Unknown
2162[expr.prim.lambda.capture] CD3 Capturing this by reference Unknown
2163[dcl.constexpr] CD4 Labels in constexpr functions Unknown
2164[basic.scope.hiding] CD5 Name hiding and using-directives Unknown
2165[basic.scope.declarative] CD6 Namespaces, declarative regions, and translation units N/A
2166[expr.const] drafting Unclear meaning of “undefined constexpr function” Not resolved
2167[expr.const] CD4 Non-member references with lifetimes within the current evaluation Unknown
2168[dcl.init.list] review Narrowing conversions and +/- infinity Not resolved
2169[over.ics.list] open Narrowing conversions and overload resolution Not resolved
2170[basic.def.odr] CD5 Unclear definition of odr-use for arrays Clang 9
2171[class.copy.ctor] CD4 Triviality of copy constructor with less-qualified parameter Clang 15
2172[except.handle] drafting Multiple exceptions with one exception object Not resolved
2173[temp.spec.partial] open Partial specialization with non-deduced contexts Not resolved
2174[temp.friend] C++17 Unclear rules for friend definitions in templates Unknown
2175[dcl.ambig.res] CD4 Ambiguity with attribute in conversion operator declaration Unknown
2176[expr.call] CD4 Destroying the returned object when a destructor throws Unknown
2177[expr.new] CD5 Placement operator delete and parameter copies Unknown
2178[temp.param] NAD Substitution of dependent template arguments in default template arguments Unknown
2179[temp.spec.partial.general] drafting Required diagnostic for partial specialization after first use Not resolved
2180[class.copy.assign] CD4 Virtual bases in destructors and defaulted assignment operators Clang 3.0
2181[implimits] C++20 Normative requirements in an informative Annex Unknown
2182[expr.add] drafting Pointer arithmetic in array-like containers Not resolved
2183[except.spec] NAD Problems in description of potential exceptions Unknown
2184[diff.expr] CD4 Missing C compatibility entry for decrement of bool Unknown
2185[basic.fundamental] CD6 Cv-qualified numeric types Unknown
2186[expr.const] C++20 Unclear point that “preceding initialization” must precede Unknown
2187[class.protected] drafting Protected members and access via qualified-id Not resolved
2188[class.mem.general] open empty-declaration grammar ambiguity Not resolved
2189[over.call.object] open Surrogate call template Not resolved
2190[cpp.cond] open Insufficient specification of __has_include Not resolved
2191[except.spec] C++17 Incorrect result for noexcept(typeid(v)) Clang 19
2192[expr.const] open Constant expressions and order-of-eval undefined behavior Not resolved
2193[numeric.limits.members] NAD numeric_limits<int>::radix and digits Unknown
2194[over.match.list] drafting Impossible case in list initialization Not resolved
2195[dcl.type.cv] open Unsolicited reading of trailing volatile members Not resolved
2196[dcl.init] C++17 Zero-initialization with virtual base classes Unknown
2197[class.copy.ctor] C++17 Overload resolution and deleted special member functions Unknown
2198[basic.link] C++17 Linkage of enumerators Unknown
2199[dcl.typedef] CD6 Typedefs and tags Clang 3.8
2200[temp.arg.explicit] NAD Conversions in template argument deduction Unknown
2201[basic.type.qualifier] C++17 Cv-qualification of array types Unknown
2202[temp.inst] drafting When does default argument instantiation occur? Not resolved
2203[class.copy.ctor] drafting Defaulted copy/move constructors and UDCs Not resolved
2204[class.base.init] NAD Naming delegated constructors Unknown
2205[dcl.attr.grammar] C++17 Restrictions on use of alignas Unknown
2206[expr] C++17 Composite type of object and function pointers Unknown
2207[basic.stc.dynamic.allocation] CD5 Alignment of allocation function return value Unknown
2208[class.mem] NAD static_assert-declaration does not declare a member Unknown
2209[except.ctor] NAD Destruction of constructed array elements Unknown
2210[except.ctor] NAD Principal/target constructor confusion Unknown
2211[expr.prim.lambda.capture] C++17 Hiding by lambda captures and parameters Clang 8
2212[dcl.typedef] CD5 Typedef changing linkage after use Unknown
2213[dcl.type.elab] CD6 Forward declaration of partial specializations Clang 2.7
2214[basic.fundamental] C++17 Missing requirement on representation of integer values Unknown
2215[expr.call] C++17 Redundant description of language linkage in function call Unknown
2216[except.spec] NAD Exception specifications in unevaluated contexts Unknown
2217[dcl.constexpr] NAD constexpr constructors for non-literal types Unknown
2218[basic.lookup] C++17 Ambiguity and namespace aliases Unknown
2219[except.handle] drafting Dynamically-unreachable handlers Not resolved
2220[stmt.ranged] C++17 Hiding index variable in range-based for Unknown
2221[dcl.fct.def.default] CD6 Copying volatile objects Unknown
2222[temp.inst] drafting Additional contexts where instantiation is not required Not resolved
2223[dcl.align] drafting Multiple alignas specifiers Not resolved
2224[expr.static.cast] C++17 Member subobjects and base-class casts Unknown
2225[expr.reinterpret.cast] NAD reinterpret_cast to same floating-point type Unknown
2226[expr.cond] CD5 Xvalues vs lvalues in conditional expressions Unknown
2227[class.base.init] CD5 Destructor access and default member initializers Unknown
2228[dcl.ambig.res] review Ambiguity resolution for cast to function type Not resolved
2229[class.bit] CD5 Volatile unnamed bit-fields Clang 7
2230[basic.link] NAD Linkage of extern "C" function in unnamed namespace Unknown
2231[expr.ref] NAD Class member access to static data member template Unknown
2232[dcl.stc] open thread_local anonymous unions Not resolved
2233[dcl.fct.default] CD5 Function parameter packs following default arguments Clang 11
2234[class] CD5 Missing rules for simple-template-id as class-name Unknown
2235[temp.deduct.partial] CD5 Partial ordering and non-dependent types Unknown
2236[temp.alias] drafting When is an alias template specialization dependent? Not resolved
2237[class.ctor] CD5 Can a template-id name a constructor? Unknown
2238[basic.stc.dynamic.allocation] NAD Contradictory alignment requirements for allocation Unknown
2239[expr.delete] NAD Sized deallocation with a trivial destructor Unknown
2240[basic.def.odr] NAD this is not odr-used in a constant expression Unknown
2241[expr.call] CD5 Overload resolution is not invoked with a single function Unknown
2242[basic.def.odr] C++23 ODR violation with constant initialization possibly omitted Unknown
2243[expr.static.cast] drafting Incorrect use of implicit conversion sequence Not resolved
2244[class.protected] open Base class access in aggregate initialization Not resolved
2245[temp.point] drafting Point of instantiation of incomplete class template Not resolved
2246[class.access.base] drafting Access of indirect virtual base class constructors Not resolved
2247[expr.prim.lambda.capture] C++17 Lambda capture and variable argument list Unknown
2248[expr.delete] C++17 Problems with sized delete Unknown
2249[expr.prim.id.unqual] CD5 identifiers and id-expressions Unknown
2250[temp.point] open Implicit instantiation, destruction, and TUs Not resolved
2251[dcl.init.list] C++17 Unreachable enumeration list-initialization Unknown
2252[dcl.init.list] CD7 Enumeration list-initialization from the same type Unknown
2253[class.bit] CD5 Unnamed bit-fields and zero-initialization Unknown
2254[class.mem] CD5 Standard-layout classes and bit-fields Unknown
2255[temp.spec] CD5 Instantiated static data member templates Unknown
2256[basic.life] CD5 Lifetime of trivially-destructible objects Unknown
2257[class.temporary] CD5 Lifetime extension of references vs exceptions Unknown
2258[basic.life] open Storage deallocation during period of destruction Not resolved
2259[dcl.ambig.res] C++17 Unclear context describing ambiguity Unknown
2260[temp.expl.spec] CD5 Explicit specializations of deleted member functions Unknown
2261[temp.friend] extension Explicit instantiation of in-class friend definition Extension
2262[dcl.asm] C++17 Attributes for asm-definition Unknown
2263[temp.inst] drafting Default argument instantiation for friends Not resolved
2264[class.copy.ctor] drafting Memberwise copying with indeterminate value Not resolved
2265[temp.inst] drafting Delayed pack expansion and member redeclarations Not resolved
2266[temp.dep.type] CD5 Has dependent type vs is type-dependent Unknown
2267[dcl.init.ref] CD5 Copy-initialization of temporary in reference direct-initialization No
2268[dcl.constexpr] C++17 Unions with mutable members in constant expressions revisited Unknown
2269[dcl.init.aggr] dup Additional recursive references in aggregate DMIs Unknown
2270[temp.explicit] NAD Non-inline functions and explicit instantiation declarations Unknown
2271[class.ctor] C++17 Aliasing this Unknown
2272[dcl.init.aggr] C++17 Implicit initialization of aggregate members of reference type Unknown
2273[class.ctor] CD5 Inheriting constructors vs implicit default constructor Clang 3.3
2274[stmt.if] NAD Generic lambda capture vs constexpr if Unknown
2275[temp.dep.expr] drafting Type-dependence of function template Not resolved
2276[temp.dep.constexpr] C++17 Dependent noexcept and function type-dependence Unknown
2277[over.ics.rank] CD5 Ambiguity inheriting constructors with default arguments Partial
2278[expr.const] CD5 Copy elision in constant expressions reconsidered Unknown
2279[dcl.attr.grammar] NAD Multiple attribute-specifiers in one attribute-list Unknown
2280[expr.new] C++20 Matching a usual deallocation function with placement new Unknown
2281[expr.new] drafting Consistency of aligned operator delete replacement Not resolved
2282[expr.new] C++20 Consistency with mismatched aligned/non-over-aligned allocation/deallocation functions Unknown
2283[expr.call] CD7 Missing complete type requirements Unknown
2284[expr.call] open Sequencing of braced-init-list arguments Not resolved
2285[dcl.struct.bind] CD5 Issues with structured bindings Clang 4
2286[expr.assign] NAD Assignment evaluation order Unknown
2287[basic.compound] CD5 Pointer-interconvertibility in non-standard-layout unions Unknown
2288[dcl.pre] NAD Contradictory optionality in simple-declaration Unknown
2289[basic.scope.declarative] CD5 Uniqueness of structured binding names Unknown
2290[over.match.funcs] CD5 Unclear specification for overload resolution and deleted special member functions Unknown
2291[over.best.ics] dup Implicit conversion sequences in non-call contexts Unknown
2292[temp.names] CD5 simple-template-id is ambiguous between class-name and type-name Clang 9
2293[class] CD5 Requirements for simple-template-id used as a class-name Unknown
2294[temp.dep.expr] CD5 Dependent auto static data members Unknown
2295[dcl.init.aggr] CD5 Aggregates with deleted defaulted constructors Unknown
2296[temp.deduct] open Are default argument instantiation failures in the “immediate context”? Not resolved
2297[intro.races] open Unclear specification of atomic operations Not resolved
2298[intro.races] open Actions and expression evaluation Not resolved
2299[dcl.constexpr] CD5 constexpr vararg functions Unknown
2300[basic.def.odr] CD5 Lambdas in multiple definitions Unknown
2301[expr.const] open Value-initialization and constexpr constructor evaluation Not resolved
2302[expr.eq] NAD Address comparison between different member subobjects Unknown
2303[temp.deduct.call] CD5 Partial ordering and recursive variadic inheritance Clang 12
2304[over.best.ics] NAD Incomplete type vs overload resolution Clang 2.8
2305[temp.explicit] CD5 Explicit instantiation of constexpr or inline variable template Unknown
2306[temp.friend] NAD Nested friend templates of class templates Unknown
2307[temp.dep.type] CD5 Unclear definition of “equivalent to a nontype template parameter” Unknown
2308[dcl.struct.bind] NAD Structured bindings and lambda capture Unknown
2309[dcl.constexpr] CD5 Restrictions on nested statements within constexpr functions Unknown
2310[conv.ptr] CD5 Type completeness and derived-to-base pointer conversions Partial
2311[over.match.list] open Missed case for guaranteed copy elision Not resolved
2312[dcl.struct.bind] CD6 Structured bindings and mutable Unknown
2313[dcl.struct.bind] CD5 Redeclaration of structured binding reference variables Unknown
2314[dcl.struct.bind] dup Structured bindings and lambda capture Unknown
2315[class.copy.ctor] CD5 What is the “corresponding special member” of a variant member? Unknown
2316[expr.cond] drafting Simplifying class conversions in conditional expressions Not resolved
2317[class.base.init] CD5 Self-referential default member initializers Unknown
2318[temp.deduct.type] CD5 Nondeduced contexts in deduction from a braced-init-list Unknown
2319[over.best.ics] drafting Nested brace initialization from same type Not resolved
2320[stmt.if] extension constexpr if and boolean conversions Extension
2321[expr.cond] CD5 Conditional operator and cv-qualified class prvalues Unknown
2322[temp.deduct] CD5 Substitution failure and lexical order Unknown
2323[basic.types] C++20 Expunge POD Unknown
2324[intro.object] drafting Size of base class subobject Not resolved
2325[intro.object] drafting std::launder and reuse of character buffers Not resolved
2326[temp.deduct.call] dup Type deduction with initializer list containing ambiguous functions Unknown
2327[dcl.init] drafting Copy elision for direct-initialization with a conversion function Not resolved
2328[temp.deduct.type] drafting Unclear presentation style of template argument deduction rules Not resolved
2329[class.copy.assign] open Virtual base classes and generated assignment operators Not resolved
2330[temp.spec] CD5 Missing references to variable templates Unknown
2331[basic.scope.class] CD6 Redundancy in description of class scope N/A
2332[dcl.type.simple] CD5 template-name as simple-type-name vs injected-class-name Unknown
2333[lex.ccon] CD6 Escape sequences in UTF-8 character literals Unknown
2334[intro.object] open Creation of objects by typeid Not resolved
2335[class.static.data] drafting Deduced return types vs member types Not resolved
2336[except.spec] CD5 Destructor characteristics vs potentially-constructed subobjects Unknown
2337[over.ics.rank] open Incorrect implication of logic ladder for conversion sequence tiebreakers Not resolved
2338[expr.static.cast] CD5 Undefined behavior converting to short enums with fixed underlying types Clang 12
2339[dcl.struct.bind] CD5 Underspecified template arguments in structured bindings Unknown
2340[dcl.struct.bind] open Reference collapsing and structured bindings Not resolved
2341[dcl.pre] CD5 Structured bindings with static storage duration Unknown
2342[expr.reinterpret.cast] CD5 Reference reinterpret_cast and pointer-interconvertibility Unknown
2343[temp.param] C++20 void* non-type template parameters Unknown
2344[stmt.select] NAD Redeclaration of names in init-statements Unknown
2345[stmt.if] CD5 Jumping across initializers in init-statements and conditions Unknown
2346[dcl.fct.default] CD5 Local variables in default arguments Clang 11
2347[expr.call] C++20 Passing short scoped enumerations to ellipsis Unknown
2348[stmt.if] NAD Non-templated constexpr if Unknown
2349[stmt] NAD Class/enumeration names vs conditions Unknown
2350[temp.deduct.partial] NAD Forwarding references and deduction guides Unknown
2351[expr.type.conv] CD5 void{} Clang 20
2352[dcl.init.ref] CD5 Similar types and reference binding Clang 10
2353[basic.def.odr] CD5 Potential results of a member access expression for a static data member Clang 9
2354[basic.align] CD5 Extended alignment and object representation Clang 15
2355[temp.deduct.type] CD6 Deducing noexcept-specifiers Unknown
2356[over.match.funcs] CD5 Base class copy and move constructors should not be inherited Clang 4
2357[basic.lookup.unqual] NAD Lookup in member function declarations Unknown
2358[expr.prim.lambda.capture] CD5 Explicit capture of value Clang 16
2359[dcl.init.aggr] CD5 Unintended copy initialization with designated initializers Unknown
2360[dcl.attr.unused] CD5 [[maybe_unused]] and structured bindings Unknown
2361[csetjmp.syn] open Unclear description of longjmp undefined behavior Not resolved
2362[dcl.fct.def.general] open __func__ should be constexpr Not resolved
2363[class.friend] NAD Opaque enumeration friend declarations Clang 19
2364[expr.const] NAD Constant expressions, aggregate initialization, and modifications Unknown
2365[expr.dynamic.cast] CD5 Confusing specification for dynamic_cast Unknown
2366[basic.start.static] CD5 Can default initialization be constant initialization? Unknown
2367[basic.def.odr] NAD Lambdas in default arguments vs the ODR Unknown
2368[expr.const] CD5 Differences in relational and three-way constant comparisons Unknown
2369[temp.deduct] CD6 Ordering between constraints and substitution Partial
2370[basic.lookup.unqual] CD6 friend declarations of namespace-scope functions No
2371[basic.def] CD5 Use of the English term “attributes” is confusing Unknown
2372[basic.link] CD5 Incorrect matching rules for block-scope extern declarations Unknown
2373[temp.func.order] CD5 Incorrect handling of static member function templates in partial ordering Unknown
2374[dcl.init.list] C++20 Overly permissive specification of enum direct-list-initialization Unknown
2375[class.static.data] NAD Multiple redeclarations of constexpr static data members Unknown
2376[over.match.class.deduct] CD5 Class template argument deduction with array declarator Clang 21
2377[over.match.viable] NAD Explicit copy constructor vs function viability Unknown
2378[expr.prim.lambda.capture] C++20 Inconsistent grammar for reference init-capture of pack Unknown
2379[temp.friend] CD5 Missing prohibition against constexpr in friend declaration Unknown
2380[basic.def.odr] CD5 capture-default makes too many references odr-usable Unknown
2381[expr.type] CD5 Composite pointer type of pointers to plain and noexcept member functions Unknown
2382[expr.new] CD5 Array allocation overhead for non-allocating placement new Unknown
2383[temp.param] NAD Variadic member functions of variadic class templates Unknown
2384[temp.deduct.conv] CD5 Conversion function templates and qualification conversions Unknown
2385[expr.prim.id.qual] CD5 Lookup for conversion-function-ids N/A
2386[dcl.struct.bind] CD5 tuple_size requirements for structured binding Clang 9
2387[basic.link] CD5 Linkage of const-qualified variable template Clang 9
2388[dcl.attr.grammar] NAD Applicability of contract-attribute-specifiers Unknown
2389[dcl.spec.auto] CD6 Agreement of deduced and explicitly-specified variable types Unknown
2390[cpp.cond] CD5 Is the argument of __has_cpp_attribute macro-expanded? Clang 14
2391[temp.variadic] dup Additional template parameters following pack expansion Unknown
2392[expr.const] C++23 new-expression size check and constant evaluation Unknown
2393[expr.pseudo] NAD Pseudo-destructors and object lifetime Unknown
2394[class.default.ctor] CD5 Const-default-constructible for members Clang 15
2395[temp.param] drafting Parameters following a pack expansion Not resolved
2396[expr.prim.id.qual] CD6 Lookup of names in complex conversion-type-ids No
2397[dcl.array] CD6 auto specifier for pointers and references to arrays Clang 17
2398[temp.arg.template] drafting Template template parameter matching and deduction Not resolved
2399[expr.assign] CD5 Unclear referent of “expression” in assignment-expression Unknown
2400[expr.const] CD5 Constexpr virtual functions and temporary objects Unknown
2401[temp.arg.nontype] C++20 Array decay vs prohibition of subobject non-type arguments Unknown
2402[lex.ccon] CD6 When is the restriction to a single c-char in a Unicode literal enforced? Unknown
2403[class.base.init] drafting Temporary materialization and base/member initialization Not resolved
2404[class.mem] CD5 [[no_unique_address]] and allocation order Unknown
2405[temp.dep.expr] CD6 Additional type-dependent expressions Unknown
2406[dcl.attr.fallthrough] CD5 [[fallthrough]] attribute and iteration statements Clang 5
2407[diff] C++23 Missing entry in Annex C for defaulted comparison operators Unknown
2408[dcl.init.aggr] NAD Temporaries and previously-initialized elements in aggregate initialization Unknown
2409[temp.expl.spec] drafting Explicit specializations of constexpr static data members Not resolved
2410[dcl.constexpr] C++23 Implicit calls of immediate functions Unknown
2411[temp.type] C++20 Comparison of pointers to members in template non-type arguments Unknown
2412[dcl.spec.auto] review SFINAE vs undeduced placeholder type Not resolved
2413[temp.res] CD6 typename in conversion-function-ids Unknown
2414[class.compare.default] C++20 Unclear results if both member and friend operator<=> are declared Unknown
2415[class.copy.assign] NAD using-declarations vs copy assignment operators Unknown
2416[temp.expl.spec] C++20 Explicit specializations vs constexpr and consteval Unknown
2417[except.spec] open Explicit instantiation and exception specifications Not resolved
2418[expr.const] CD5 Missing cases in definition of “usable in constant expressions” Unknown
2419[expr.add] C++20 Loss of generality treating pointers to objects as one-element arrays Unknown
2420[except.spec] dup Exception specifications in explicit instantiation Unknown
2421[temp.explicit] drafting Explicit instantiation of constrained member functions Not resolved
2422[temp.deduct.guide] C++20 Incorrect grammar for deduction-guide Unknown
2423[basic.pre] NAD Typedefs, names, and entities Unknown
2424[dcl.constexpr] C++20 constexpr initialization requirements for variant members Unknown
2425[over.match.class.deduct] open Confusing wording for deduction from a type Not resolved
2426[except.ctor] C++20 Reference to destructor that cannot be invoked Unknown
2427[expr.assign] C++20 Deprecation of volatile operands and unevaluated contexts Unknown
2428[temp.concept] C++23 Deprecating a concept Clang 19
2429[stmt.dcl] C++20 Initialization of thread_local variables referenced by lambdas Unknown
2430[class.mem] C++20 Completeness of return and parameter types of member functions Clang 2.7
2431[basic.exec] C++20 Full-expressions and temporaries bound to references Unknown
2432[class.spaceship] C++20 Return types for defaulted <=> Unknown
2433[basic.def.odr] C++20 Variable templates in the ODR Unknown
2434[class.temporary] review Mandatory copy elision vs non-class objects Not resolved
2435[temp.spec] open Alias template specializations Not resolved
2436[dcl.fct.def.coroutine] C++20 Copy semantics of coroutine parameters Unknown
2437[class.spaceship] C++20 Conversion of std::strong_ordering in a defaulted operator<=> Unknown
2438[conv.qual] open Problems in the specification of qualification conversions Not resolved
2439[expr.const] C++20 Undefined term in definition of “usable in constant expressions” Unknown
2440[expr.const] C++23 Allocation in core constant expressions Unknown
2441[dcl.inline] C++20 Inline function parameters Unknown
2442[over.match.viable] C++20 Incorrect requirement for default arguments Unknown
2443[module.interface] C++23 Meaningless template exports Unknown
2444[basic.start.dynamic] drafting Constant expressions in initialization odr-use Not resolved
2445[temp.func.order] C++20 Partial ordering with rewritten candidates Clang 19
2446[temp.dep.expr] C++20 Questionable type-dependency of concept-ids Unknown
2447[dcl.spec.auto] C++20 Unintended description of abbreviated function templates Unknown
2448[basic.fundamental] CD6 Cv-qualification of arithmetic types and deprecation of volatile Unknown
2449[expr.unary.op] extension Thunks as an implementation technique for pointers to virtual functions Extension
2450[temp.names] CD7 braced-init-list as a template-argument Clang 18
2451[dcl.fct.def.coroutine] C++23 promise.unhandled_exception() and final suspend point Unknown
2452[stmt.return.coroutine] CD6 Flowing off the end of a coroutine Unknown
2453[dcl.spec.auto.general] NAD Deduced return types and coroutine lambdas Unknown
2454[expr.await] NAD Tail recursion and coroutine symmetric transfer Unknown
2455[lex.phases] CD6 Concatenation of string literals vs translation phases 5 and 6 Unknown
2456[expr.const] open Viable user-defined conversions in converted constant expressions Not resolved
2457[temp.dep.type] CD6 Unexpanded parameter packs don't make a function type dependent Unknown
2458[expr.ref] CD6 Value category of expressions denoting non-static member functions Unknown
2459[temp.arg.nontype] CD7 Template parameter initialization Clang 18
2460[dcl.link] CD6 C language linkage and constrained non-template friends Unknown
2461[temp.res] CD6 Diagnosing non-bool type constraints Unknown
2462[temp.res.general] open Problems with the omission of the typename keyword Not resolved
2463[class.prop] open Trivial copyability and unions with non-trivial members Not resolved
2464[ptr.launder] CD6 Constexpr launder and unions Unknown
2465[dcl.fct.def.coroutine] CD6 Coroutine parameters passed to a promise constructor Unknown
2466[expr.await] CD6 co_await should be a single evaluation Unknown
2467[over.match.class.deduct] drafting CTAD for alias templates and the deducible check Not resolved
2468[temp.res.general] open Omission of the typename keyword in a member template parameter list Not resolved
2469[intro.object] drafting Implicit object creation vs constant expressions Not resolved
2470[intro.object] CD6 Multiple array objects providing storage for one object Unknown
2471[over.match.class.deduct] drafting Nested class template argument deduction Not resolved
2472[expr.await] NAD Value categories in await-expressions Unknown
2473[expr.prim.id.dtor] open Parentheses in pseudo-destructor calls Not resolved
2474[expr.delete] CD6 Cv-qualification and deletion Unknown
2475[basic.fundamental] C++23 Object declarations of type cv void Unknown
2476[dcl.spec.auto.general] CD7 placeholder-type-specifiers and function declarators Unknown
2477[class.copy.ctor] CD6 Defaulted vs deleted copy constructors/assignment operators Unknown
2478[temp.expl.spec] C++23 Properties of explicit specializations of implicitly-instantiated class templates Unknown
2479[basic.start.main] CD6 Missing specifications for consteval and constinit Unknown
2480[basic.lookup.general] drafting Lookup for enumerators in modules Not resolved
2481[dcl.init.ref] CD6 Cv-qualification of temporary to which a reference is bound Unknown
2482[bit.cast] CD6 bit_cast and indeterminate values Unknown
2483[dcl.link] C++23 Language linkage of static member functions Unknown
2484[conv.prom] CD6 char8_t and char16_t in integral promotions Unknown
2485[conv.prom] CD7 Bit-fields in integral promotions Unknown
2486[expr.call] CD6 Call to noexcept function via noexcept(false) pointer/lvalue Clang 4 (C++17 onwards)
2487[temp.dep.expr] drafting Type dependence of function-style cast to incomplete array type Not resolved
2488[basic.scope.scope] open Overloading virtual functions and functions with trailing requires-clauses Not resolved
2489[intro.object] C++23 Storage provided by array of char Unknown
2490[expr.const] CD6 Restrictions on destruction in constant expressions Unknown
2491[module.interface] CD6 Export of typedef after its first declaration Unknown
2492[over.ics.list] open Comparing user-defined conversion sequences in list-initialization Not resolved
2493[dcl.spec.auto.general] dup auto as a conversion-type-id Unknown
2494[basic.def.odr] CD6 Multiple definitions of non-odr-used entities Unknown
2495[stmt.return] open Glvalue result of a function call Not resolved
2496[class.virtual] CD6 ref-qualifiers and virtual overriding Clang 21
2497[temp.point] drafting Points of instantiation for constexpr function templates Not resolved
2498[temp.deduct.general] open Partial specialization failure and the immediate context Not resolved
2499[basic.compound] CD6 Inconsistency in definition of pointer-interconvertibility Unknown
2500[expr.static.cast] extension noexcept(false) functions and noexcept expressions Extension
2501[temp.explicit] drafting Explicit instantiation and trailing requires-clauses Not resolved
2502[basic.scope.block] CD6 Unintended declaration conflicts in nested statement scopes Unknown
2503[expr.prim.id] drafting Unclear relationship among name, qualified name, and unqualified name Not resolved
2504[class.inhctor.init] CD7 Inheriting constructors from virtual base classes No
2505[namespace.unnamed] drafting Nested unnamed namespace of inline unnamed namespace Not resolved
2506[dcl.struct.bind] CD6 Structured bindings and array cv-qualifiers Unknown
2507[over.oper.general] CD6 Default arguments for operator[] Unknown
2508[temp.local] C++23 Restrictions on uses of template parameter names Unknown
2509[expr.prim.lambda.general] CD6 decl-specifier-seq in lambda-specifiers Unknown
2510[class.mem.general] NAD noexcept-specifier of friend function vs class completeness Unknown
2511[class.bit] CD6 cv-qualified bit-fields Unknown
2512[expr.typeid] NAD typeid and incomplete class types Clang 2.7
2513[class.conv.fct] open Ambiguity with requires-clause and operator-function-id Not resolved
2514[basic.life] open Modifying const subobjects Not resolved
2515[expr.call] open Result of a function call Not resolved
2516[basic.scope.pdecl] C++23 Locus of enum-specifier or opaque-enum-declaration Clang 3.0
2517[expr.prim.req.nested] C++23 Useless restriction on use of parameter in constraint-expression Clang 21
2518[intro.compliance.general] C++23 Conformance requirements and #error/#warning Clang 17
2519[basic.types.general] CD7 Object representation of a bit-field Unknown
2520[defns.signature.templ] C++23 Template signature and default template arguments Unknown
2521[over.literal] C++23 User-defined literals and reserved identifiers Clang 17
2522[cpp.concat] open Removing placemarker tokens and retention of whitespace Not resolved
2523[expr.const] C++23 Undefined behavior via omitted destructor call in constant expressions Unknown
2524[over.ics.rank] NAD Distinguishing user-defined conversion sequences by ref-qualifier Unknown
2525[over.best.ics.general] open Incorrect definition of implicit conversion sequence Not resolved
2526[expr.rel] C++23 Relational comparison of void* pointers Unknown
2527[dcl.attr.nouniqueaddr] NAD Non-class potentially-overlapping objects Unknown
2528[expr.arith.conv] C++23 Three-way comparison and the usual arithmetic conversions Unknown
2529[expr.const] C++23 Constant destruction of constexpr references Unknown
2530[basic.def.odr] C++23 Multiple definitions of enumerators Unknown
2531[dcl.constexpr] CD7 Static data members redeclared as constexpr Unknown
2532[expr.new] open Kind of pointer value returned by new T[0] Not resolved
2533[basic.stc] CD7 Storage duration of implicitly created objects Unknown
2534[expr.ref] CD6 Value category of pseudo-destructor expression Unknown
2535[expr.ref] CD6 Type punning in class member access Unknown
2536[expr.const] open Partially initialized variables during constant initialization Not resolved
2537[dcl.fct] drafting Overbroad grammar for parameter-declaration Not resolved
2538[dcl.attr.grammar] C++23 Can standard attributes be syntactically ignored? Unknown
2539[class.spaceship] C++23 Three-way comparison requiring strong ordering for floating-point types Unknown
2540[lex.ccon] CD6 Unspecified interpretation of numeric-escape-sequence Unknown
2541[module.unit] open Linkage specifications, module purview, and module attachment Not resolved
2542[expr.prim.lambda.closure] CD7 Is a closure type a structural type? Unknown
2543[dcl.constinit] C++23 constinit and optimized dynamic initialization Unknown
2544[basic.compound] open Address of past-the-end of a potentially-overlapping subobject Not resolved
2545[expr.const] open Transparently replacing objects in constant expressions Not resolved
2546[class.compare.secondary] CD7 Defaulted secondary comparison operators defined as deleted Unknown
2547[dcl.fct.def.default] CD7 Defaulted comparison operator function for non-classes Clang 20
2548[expr.add] NAD Array prvalues and additive operators Unknown
2549[expr.prim.id.qual] CD7 Implicitly moving the operand of a throw-expression in unevaluated contexts Unknown
2550[dcl.ref] CD7 Type "reference to cv void" outside of a declarator Unknown
2551[basic.life] review "Refers to allocated storage" has no meaning Not resolved
2552[expr.const] CD7 Constant evaluation of non-defining variable declarations Unknown
2553[dcl.fct] review Restrictions on explicit object member functions Not resolved
2554[class.virtual] review Overriding virtual functions, also with explicit object parameters Not resolved
2555[namespace.udecl] tentatively ready Ineffective redeclaration prevention for using-declarators Not resolved
2556[stmt.return.coroutine] CD7 Unusable promise::return_void Unknown
2557[expr.ref] review Class member access referring to an unrelated class Not resolved
2558[expr.const] C++23 Uninitialized subobjects as a result of an immediate invocation Unknown
2559[expr.const] open Defaulted consteval functions Not resolved
2560[expr.prim.req.general] CD7 Parameter type determination in a requirement-parameter-list Unknown
2561[expr.prim.lambda.closure] CD7 Conversion to function pointer for lambda with explicit object parameter No
2562[dcl.fct.def.coroutine] open Exceptions thrown during coroutine startup Not resolved
2563[dcl.fct.def.coroutine] review Initialization of coroutine result object Not resolved
2564[over.call.object] drafting Conversion to function pointer with an explicit object parameter Not resolved
2565[expr.prim.req.general] open Invalid types in the parameter-declaration-clause of a requires-expression Not resolved
2566[expr.new] review Matching deallocation for uncaught exception Not resolved
2567[class.member.lookup] NAD Operator lookup ambiguity Unknown
2568[class.compare.default] CD7 Access checking during synthesis of defaulted comparison operator Unknown
2569[expr.prim.id.unqual] CD6 Use of decltype(capture) in a lambda's parameter-declaration-clause Unknown
2570[dcl.fct.def.default] CD7 Clarify constexpr for defaulted functions Unknown
2571[expr.sub] CD6 Evaluation order for subscripting Unknown
2572[over.over] review Address of overloaded function with no target Not resolved
2573[lex.phases] CD7 Undefined behavior when splicing results in a universal-character-name Unknown
2574[lex.pptoken] CD7 Undefined behavior when lexing unmatched quotes Unknown
2575[cpp.cond] open Undefined behavior when macro-replacing "defined" operator Not resolved
2576[cpp.include] open Undefined behavior with macro-expanded #include directives Not resolved
2577[cpp.replace.general] open Undefined behavior for preprocessing directives in macro arguments Not resolved
2578[cpp.stringize] CD7 Undefined behavior when creating an invalid string literal via stringizing Unknown
2579[cpp.concat] CD7 Undefined behavior when token pasting does not create a preprocessing token Unknown
2580[cpp.line] CD7 Undefined behavior with #line Unknown
2581[cpp.predefined] open Undefined behavior for predefined macros Not resolved
2582[class.member.lookup] CD6 Differing member lookup from nested classes Unknown
2583[class.mem.general] C++23 Common initial sequence should consider over-alignment Clang 19
2584[temp.over.link] open Equivalent types in function template declarations Not resolved
2585[dcl.fct.def.coroutine] CD6 Name lookup for coroutine allocation Unknown
2586[class.compare.default] CD6 Explicit object parameter for assignment and comparison Clang 20
2587[intro.races] review Visible side effects and initial value of an object Not resolved
2588[class.friend] CD7 friend declarations and module linkage Unknown
2589[temp.constr.atomic] review Context of access checks during constraint satisfaction checking Not resolved
2590[dcl.enum] C++23 Underlying type should determine size and alignment requirements of an enum Unknown
2591[class.union.general] CD7 Implicit change of active union member for anonymous union in union Unknown
2592[expr.new] open Missing definition for placement allocation/deallocation function Not resolved
2593[expr.mptr.oper] review Insufficient base class restriction for pointer-to-member expression Not resolved
2594[basic.start.main] CD6 Disallowing a global function template main Unknown
2595[special] CD7 "More constrained" for eligible special member functions Unknown
2596[temp.inst] drafting Instantiation of constrained non-template friends Not resolved
2597[module.unit] CD6 Replaceable allocation and deallocation functions in the global module Unknown
2598[basic.types.general] C++23 Unions should not require a non-static data member of literal type Clang 18
2599[expr.call] C++23 What does initializing a parameter include? Unknown
2600[temp.dep.expr] CD7 Type dependency of placeholder types Unknown
2601[except.ctor] C++23 Tracking of created and destroyed subobjects Unknown
2602[dcl.constexpr] C++23 consteval defaulted functions Unknown
2603[temp.over.link] C++23 Holistic functional equivalence for function templates Unknown
2604[temp.expl.spec] C++23 Attributes for an explicit specialization Unknown
2605[class.prop] C++23 Implicit-lifetime aggregates Unknown
2606[expr.static.cast] CD6 static_cast from "pointer to void" does not handle similar types Unknown
2607[module.interface] drafting Visibility of enumerator names Not resolved
2608[temp.arg.explicit] CD6 Omitting an empty template argument list Unknown
2609[expr.sizeof] open Padding in class types Not resolved
2610[dcl.init.aggr] C++23 Indirect private base classes in aggregates Unknown
2611[temp.variadic] C++23 Missing parentheses in expansion of fold-expression could cause syntactic reinterpretation Unknown
2612[dcl.init.general] C++23 Incorrect comment in example Unknown
2613[dcl.fct.def.coroutine] C++23 Incomplete definition of resumer Unknown
2614[expr.ref] C++23 Unspecified results for class member access Unknown
2615[cpp.cond] C++23 Missing __has_cpp_attribute(assume) Unknown
2616[stmt] C++23 Imprecise restrictions on break and continue Unknown
2617[temp.param] review Default template arguments for template members of non-template classes Not resolved
2618[temp.deduct.general] C++23 Substitution during deduction should exclude exception specifications Unknown
2619[dcl.init.aggr] C++23 Kind of initialization for a designated-initializer-list Unknown
2620[dcl.ambig.res] C++23 Nonsensical disambiguation rule Unknown
2621[enum.udecl] C++23 Kind of lookup for using enum declarations Superseded by 2877
2622[implimits] C++23 Compounding types from function and pointer-to-member types Unknown
2623[expr.new] drafting Invoking destroying operator delete for constructor failure Not resolved
2624[expr.delete] C++23 Array delete expression with no array cookie Unknown
2625[basic.life] C++23 Deletion of pointer to out-of-lifetime object Unknown
2626[expr.unary.op] C++23 Rephrase ones' complement using base-2 representation Unknown
2627[dcl.init.list] C++23 Bit-fields and narrowing conversions Clang 20
2628[over.match.class.deduct] CD7 Implicit deduction guides should propagate constraints Clang 20
2629[stmt.switch] C++23 Variables of floating-point type as switch conditions Unknown
2630[class.mem.general] C++23 Syntactic specification of class completeness Clang 9
2631[expr.const] C++23 Immediate function evaluations in default arguments Clang 16
2632[intro.defs] drafting 'user-declared' is not defined Not resolved
2633[expr.const] open typeid of constexpr-unknown dynamic type Not resolved
2634[dcl.type.elab] CD7 Avoid circularity in specification of scope for friend class declarations Unknown
2635[dcl.pre] C++23 Constrained structured bindings Clang 16
2636[ub] C++23 Update Annex E based on Unicode 15.0 UAX #31 N/A
2637[class.pre] CD7 Injected-class-name as a simple-template-id Unknown
2638[dcl.init.list] CD7 Improve the example for initializing by initializer list Unknown
2639[lex.phases] C++23 new-lines after phase 1 Unknown
2640[lex.charset] C++23 Allow more characters in an n-char sequence Clang 16
2641[lex.literal] C++23 Redundant specification of value category of literals Unknown
2642[class.member.lookup] C++23 Inconsistent use of T and C N/A
2643[basic.types.general] C++23 Completing a pointer to array of unknown bound Unknown
2644[expr.prim.lambda.capture] C++23 Incorrect comment in example Clang 8
2645[expr.call] C++23 Unused term "default argument promotions" Unknown
2646[dcl.fct.def.default] C++23 Defaulted special member functions Unknown
2647[expr.const] C++23 Fix for "needed for constant evaluation" Unknown
2648[over.call] C++23 Correspondence of surrogate call function and conversion function Unknown
2649[over.call.object] C++23 Incorrect note about implicit conversion sequence Unknown
2650[temp.deduct.general] C++23 Incorrect example for ill-formed non-type template arguments Clang 17
2651[temp.deduct.conv] C++23 Conversion function templates and "noexcept" Unknown
2652[cpp.predefined] C++23 Overbroad definition of __STDCPP_BFLOAT16_T__ Unknown
2653[dcl.fct] C++23 Can an explicit object parameter have a default argument? Clang 18
2654[expr.assign] C++23 Un-deprecation of compound volatile assignments Clang 16
2655[temp.inst] NAD Instantiation of default arguments in lambda-expressions Unknown
2656[expr.const] drafting Converting consteval lambda to function pointer in non-immediate context Not resolved
2657[dcl.init.ref] CD7 Cv-qualification adjustment when binding reference to temporary Unknown
2658[expr.const] C++23 Trivial copying of unions in core constant expressions Unknown
2659[cpp.predefined] C++23 Missing feature-test macro for lifetime extension in range-for loop Unknown
2660[expr.call] open Confusing term "this parameter" Not resolved
2661[class.mem.general] CD7 Missing disambiguation rule for pure-specifier vs. brace-or-equal-initializer Unknown
2662[class.access.general] C++23 Example for member access control vs. overload resolution Unknown
2663[namespace.udecl] CD7 Example for member redeclarations with using-declarations Unknown
2664[over.match.class.deduct] C++23 Deduction failure in CTAD for alias templates Unknown
2665[basic.life] NAD Replacing a subobject with a complete object Unknown
2666[class.temporary] open Lifetime extension through static_cast Not resolved
2667[cpp.import] C++23 Named module imports do not import macros Unknown
2668[expr.await] CD7 co_await in a lambda-expression Unknown
2669[class.base.init] open Lifetime extension for aggregate initialization Not resolved
2670[basic.link] open Programs and translation units Not resolved
2671[dcl.meaning.general] open friend named by a template-id Not resolved
2672[temp.deduct.general] CD7 Lambda body SFINAE is still required, contrary to intent and note Clang 18
2673[over.match.oper] C++23 User-declared spaceship vs. built-in operators Unknown
2674[class.ctor.general] C++23 Prohibit explicit object parameters for constructors Unknown
2675[class.union.general] open start_lifetime_as, placement-new, and active union members Not resolved
2676[basic.life] open Replacing a complete object having base subobjects Not resolved
2677[basic.life] review Replacing union subobjects Not resolved
2678[basic.def.odr] C++23 std::source_location::current is unimplementable Unknown
2679[over.best.ics.general] open Implicit conversion sequence with a null pointer constant Not resolved
2680[over.match.class.deduct] open Class template argument deduction for aggregates with designated initializers Not resolved
2681[over.match.class.deduct] C++23 Deducing member array type from string literal Clang 17
2682[temp.pre] C++23 Templated function vs. function template Unknown
2683[dcl.fct.default] CD7 Default arguments for member functions of templated nested classes Unknown
2684[basic.start.dynamic] open thread_local dynamic initialization Not resolved
2685[over.match.class.deduct] C++23 Aggregate CTAD, string, and brace elision Unknown
2686[temp.constr.constr] open Pack expansion into a non-pack parameter of a concept Not resolved
2687[over.match.call.general] C++23 Calling an explicit object member function via an address-of-overload-set Clang 18
2688[expr.call] open Calling explicit object member functions Not resolved
2689[basic.fundamental] CD7 Are cv-qualified std::nullptr_t fundamental types? Unknown
2690[class.copy.assign] C++23 Semantics of defaulted move assignment operator for unions Unknown
2691[lex.ccon] C++23 hexadecimal-escape-sequence is too greedy Unknown
2692[over.match.call.general] C++23 Static and explicit object member functions with the same parameter-type-lists Clang 19
2693[cpp.line] open Escape sequences for the string-literal of #line Not resolved
2694[cpp.pragma.op] open string-literals of the _Pragma operator Not resolved
2695[dcl.attr.grammar] C++23 Semantic ignorability of attributes Unknown
2696[expr.rel] dup Relational comparisons of pointers to void Unknown
2697[temp.deduct.guide] CD7 Deduction guides using abbreviated function syntax Unknown
2698[lex.icon] CD7 Using extended integer types with z suffix Unknown
2699[expr.throw] CD7 Inconsistency of throw-expression specification Unknown
2700[intro.compliance.general] CD7 #error disallows existing implementation practice Unknown
2701[dcl.fct.default] open Default arguments in multiple scopes / inheritance of array bounds in the same scope Not resolved
2702[expr.const] open Constant destruction of reference members Not resolved
2703[class.spaceship] CD7 Three-way comparison requiring strong ordering for floating-point types, take 2 Unknown
2704[dcl.init.ref] open Clarify meaning of "bind directly" Not resolved
2705[expr.ref] open Accessing ambiguous subobjects Not resolved
2706[basic.link] open Repeated structured binding declarations Not resolved
2707[temp.deduct.guide] CD7 Deduction guides cannot have a trailing requires-clause Clang 20
2708[dcl.init.general] CD7 Parenthesized initialization of arrays Unknown
2709[dcl.init.general] NAD Parenthesized initialization of reference-to-aggregate Unknown
2710[expr.const] CD7 Loops in constant expressions Unknown
2711[expr.throw] CD7 Source for copy-initializing the exception object Unknown
2712[over.match.oper] CD7 Simplify restrictions on built-in assignment operator candidates Unknown
2713[dcl.init.list] CD7 Initialization of reference-to-aggregate from designated initializer list Unknown
2714[over.match.class.deduct] CD7 Implicit deduction guides omit properties from the parameter-declaration-clause of a constructor Unknown
2715[expr.call] CD7 "calling function" for parameter initialization may not exist Unknown
2716[class.conv.fct] CD7 Rule about self-or-base conversion is normatively redundant Unknown
2717[temp.variadic] CD7 Pack expansion for alignment-specifier Unknown
2718[expr.static.cast] CD7 Type completeness for derived-to-base conversions Clang 2.7
2719[basic.align] CD7 Creating objects in misaligned storage Unknown
2720[temp.res.general] CD7 Template validity rules for templated entities and alias templates Unknown
2721[basic.life] CD7 When exactly is storage reused? Unknown
2722[expr.unary.noexcept] CD7 Temporary materialization conversion for noexcept operator Unknown
2723[basic.fundamental] CD7 Range of representable values for floating-point types Unknown
2724[expr.shift] CD7 Clarify rounding for arithmetic right shift Unknown
2725[expr.ref] CD7 Overload resolution for non-call of class member access Unknown
2726[lex.digraph] review Alternative tokens appearing as attribute-tokens Not resolved
2727[module.import] open Importing header units synthesized from source files Not resolved
2728[expr.delete] CD7 Evaluation of conversions in a delete-expression Unknown
2729[expr.new] CD7 Meaning of new-type-id Unknown
2730[over.match.oper] open Comparison templates on enumeration types Not resolved
2731[over.ics.user] open List-initialization sequence with a user-defined conversion Not resolved
2732[module.import] CD7 Can importable headers react to preprocessor state from point of import? Unknown
2733[dcl.attr.unused] CD7 Applying [[maybe_unused]] to a label Unknown
2734[expr.const] open Immediate forward-declared function templates Not resolved
2735[over.match.best] open List-initialization and conversions in overload resolution Not resolved
2736[class.prop] open Standard layout class with empty base class also in first member Not resolved
2737[expr.prim.lambda.capture] review Temporary lifetime extension for reference init-captures Not resolved
2738[expr.prim.id.unqual] review "denotes a destructor" is missing specification Not resolved
2739[expr.prim.req.nested] open Nested requirement not a constant expression Not resolved
2740[expr.const] open Too many objects have constexpr-unknown type Not resolved
2741[over.ics.list] open Implicit conversion sequence from empty list to array of unknown bound Not resolved
2742[dcl.init.list] drafting Guaranteed copy elision for brace-initialization from prvalue Not resolved
2743[class.copy.ctor] open Copying non-trivial objects nested within a union Not resolved
2744[intro.object] open Multiple objects of the same type at the same address Not resolved
2745[basic.def.odr] CD7 Dependent odr-use in generic lambdas Unknown
2746[temp.res.general] CD7 Checking of default template arguments Unknown
2747[lex.phases] CD7 Cannot depend on an already-deleted splice Unknown
2748[expr.ref] CD7 Accessing static data members via null pointer Unknown
2749[expr.rel] CD7 Treatment of "pointer to void" for relational comparisons Clang 20
2750[expr.const] CD7 construct_at without constructor call Unknown
2751[stmt.dcl] NAD Order of destruction for parameters for operator functions Unknown
2752[lex.fcon] open Excess-precision floating-point literals Not resolved
2753[intro.object] CD7 Storage reuse for string literal objects and backing arrays Unknown
2754[dcl.fct.def.coroutine] CD7 Using *this in explicit object member functions that are coroutines Unknown
2755[expr.const] CD7 Incorrect wording applied by P2738R1 Unknown
2756[class.init] review Completion of initialization by delegating constructor Not resolved
2757[class.cdtor] review Deleting or deallocating storage of an object during its construction Not resolved
2758[expr.delete] CD7 What is "access and ambiguity control"? Unknown
2759[class.mem.general] CD7 [[no_unique_address]] and common initial sequence Clang 19
2760[expr.const] CD7 Defaulted constructor that is an immediate function Unknown
2761[class.dtor] CD7 Implicitly invoking the deleted destructor of an anonymous union member Unknown
2762[over.match.funcs.general] CD7 Type of implicit object parameter Unknown
2763[expr.const] CD7 Ignorability of [[noreturn]] during constant evaluation Unknown
2764[basic.scope.scope] CD7 Use of placeholders affecting name mangling Unknown
2765[intro.object] open Address comparisons between potentially non-unique objects during constant evaluation Not resolved
2766[lex.string] open Repeated evaluation of a string-literal may yield different objects Not resolved
2767[class.union.anon] open Non-defining declarations of anonymous unions Not resolved
2768[expr.assign] CD7 Assignment to enumeration variable with a braced-init-list Unknown
2769[temp.deduct.general] open Substitution into template parameters and default template arguments should be interleaved Not resolved
2770[temp.deduct.general] open Trailing requires-clause can refer to function parameters before they are substituted into Not resolved
2771[class.mfct.non.static] CD7 Transformation for unqualified-ids in address operator Clang 18
2772[diff.cpp03.dcl.dcl] CD7 Missing Annex C entry for linkage effects of linkage-specification Unknown
2773[class.union.anon] open Naming anonymous union members as class members Not resolved
2774[temp.dep.constexpr] open Value-dependence of requires-expressions Not resolved
2775[except.throw] CD7 Unclear argument type for copy of exception object Unknown
2776[intro.compliance.general] open Substitution failure and implementation limits Not resolved
2777[temp.param] CD7 Type of id-expression denoting a template parameter object Unknown
2778[expr.const] review Trivial destructor does not imply constant destruction Not resolved
2779[lex.charset] open Restrictions on the ordinary literal encoding Not resolved
2780[expr.reinterpret.cast] CD7 reinterpret_cast to reference to function types Unknown
2781[basic.def.odr] open Unclear recursion in the one-definition rule Not resolved
2782[basic.def.odr] open Treatment of closure types in the one-definition rule Not resolved
2783[module.global.frag] CD7 Handling of deduction guides in global-module-fragment Unknown
2784[support.types.layout] open Unclear definition of member-designator for offsetof Not resolved
2785[temp.dep.expr] CD7 Type-dependence of requires-expression Unknown
2786[expr.eq] open Comparing pointers to complete objects Not resolved
2787[special] open Kind of explicit object copy/move assignment function Not resolved
2788[basic.scope.scope] open Correspondence and redeclarations Not resolved
2789[over.match.best.general] CD7 Overload resolution with implicit and explicit object member functions Clang 18
2790[over.ics.list] open Aggregate initialization and user-defined conversion sequence Not resolved
2791[stmt.return] CD7 Unclear phrasing about "returning to the caller" Unknown
2792[expr.unary.noexcept] CD7 Clean up specification of noexcept operator Unknown
2793[basic.scope.block] CD7 Block-scope declaration conflicting with parameter name Unknown
2794[temp.alias] open Uniqueness of lambdas in alias templates Not resolved
2795[intro.object] CD7 Overlapping empty subobjects with different cv-qualification Unknown
2796[expr.rel] CD7 Function pointer conversions for relational operators Unknown
2797[over.match.oper] review Meaning of "corresponds" for rewritten operator candidates Not resolved
2798[expr.const] CD7 Manifestly constant evaluation of the static_assert message Clang 17
2799[class.default.ctor] drafting Inheriting default constructors Not resolved
2800[expr.const] review Instantiating constexpr variables for potential constant evaluation Not resolved
2801[dcl.init.ref] CD7 Reference binding with reference-related types Unknown
2802[dcl.fct] open Constrained auto and redeclaration with non-abbreviated syntax Not resolved
2803[over.ics.ref] CD7 Overload resolution for reference binding of similar types Unknown
2804[over.match.oper] open Lookup for determining rewrite targets Not resolved
2805[expr.delete] open Underspecified selection of deallocation function Not resolved
2806[temp.res.general] CD7 Make a type-requirement a type-only context Unknown
2807[class.dtor] CD7 Destructors declared consteval Unknown
2808[temp.inst] review Explicit specialization of defaulted special member function Not resolved
2809[dcl.fct.def.default] CD7 An implicit definition does not redeclare a function Unknown
2810[temp.res.general] CD7 Requiring the absence of diagnostics for templates Unknown
2811[basic.start.main] CD7 Clarify "use" of main Clang 3.5
2812[expr.new] open Allocation with explicit alignment Not resolved
2813[expr.ref] CD7 Class member access with prvalues Clang 20
2814[expr.static.cast] NAD Alignment requirement of incomplete class type Unknown
2815[over.ics.rank] CD7 Overload resolution for references/pointers to noexcept functions Unknown
2816[intro.progress] review Unclear phrasing "may assume ... eventually" Not resolved
2817[expr.sizeof] open sizeof(abstract class) is underspecified Not resolved
2818[lex.name] CD7 Use of predefined reserved identifiers Unknown
2819[expr.const] CD7 Cast from null pointer value in a constant expression Clang 19 (C++26 onwards)
2820[dcl.init.general] CD7 Value-initialization and default constructors Unknown
2821[basic.life] review Lifetime, zero-initialization, and dynamic initialization Not resolved
2822[basic.stc.general] CD7 Side-effect-free pointer zap Unknown
2823[expr.unary.op] CD7 Implicit undefined behavior when dereferencing pointers No
2824[dcl.init.general] CD7 Copy-initialization of arrays Unknown
2825[stmt.ranged] CD7 Range-based for statement using a braced-init-list Unknown
2826[class.temporary] drafting Missing definition of "temporary expression" Not resolved
2827[basic.fundamental] review Representation of unsigned integral types Not resolved
2828[expr.cast] CD7 Ambiguous interpretation of C-style cast Unknown
2829[over.best.ics.general] open Redundant case in restricting user-defined conversion sequences Not resolved
2830[dcl.init.list] CD7 Top-level cv-qualification should be ignored for list-initialization Unknown
2831[dcl.decl.general] CD7 Non-templated function definitions and requires-clauses Unknown
2832[class.temporary] open Invented temporary variables and temporary objects Not resolved
2833[basic.start.dynamic] review Evaluation of odr-use Not resolved
2834[temp.func.order] review Partial ordering and explicit object parameters Not resolved
2835[basic.scope.scope] open Name-independent declarations Not resolved
2836[conv.rank] CD7 Conversion rank of long double and extended floating-point types Unknown
2837[class.copy.ctor] open Instantiating and inheriting by-value copy constructors Not resolved
2838[basic.scope.block] open Declaration conflicts in lambda-expressions Not resolved
2839[class.dtor] open Explicit destruction of base classes Not resolved
2840[basic.align] open Missing requirements for fundamental alignments Not resolved
2841[class.ctor.general] open When do const objects start being const? Not resolved
2842[over.ics.rank] open Preferring an initializer_list over a single value Not resolved
2843[intro.refs] CD7 Undated reference to Unicode makes C++ a moving target Unknown
2844[over.match.oper] open Enumerating a finite set of built-in candidates Not resolved
2845[expr.prim.lambda.closure] CD7 Make the closure type of a captureless lambda a structural type Unknown
2846[dcl.fct] CD7 Out-of-class definitions of explicit object member functions Unknown
2847[temp.expl.spec] review Constrained explicit specializations of function templates at class scope Not resolved
2848[temp.explicit] CD7 Omitting an empty template argument list for explicit instantiation Unknown
2849[class.temporary] CD7 Parameter objects are not temporary objects Unknown
2850[basic.stc] CD7 Unclear storage duration for function parameter objects Unknown
2851[expr.const] CD7 Allow floating-point conversions in converted constant expressions Unknown
2852[class.mem.general] open Complete-class contexts and class-scope lambdas Not resolved
2853[expr.add] CD7 Pointer arithmetic with pointer to hypothetical element Unknown
2854[except.throw] CD7 Storage duration of exception objects Unknown
2855[expr.post.incr] CD7 Undefined behavior in postfix increment Unknown
2856[over.match.list] CD7 Copy-list-initialization with explicit default constructors Unknown
2857[basic.lookup.argdep] CD7 Argument-dependent lookup with incomplete class types No
2858[expr.prim.id.qual] CD7 Declarative nested-name-specifiers and pack-index-specifiers Clang 19
2859[dcl.init.general] CD7 Value-initialization with multiple default constructors Unknown
2860[basic.life] dup Remove and fix the term "vacuous initialization" Unknown
2861[expr.dynamic.cast] CD7 dynamic_cast on bad pointer value Unknown
2862[temp.pre] review Unclear boundaries of template declarations Not resolved
2863[basic.life] drafting Unclear synchronization requirements for object lifetime rules Not resolved
2864[dcl.init.list] CD7 Narrowing floating-point conversions Unknown
2865[expr.cond] CD7 Regression on result of conditional operator Unknown
2866[dcl.attr] open Observing the effects of [[no_unique_address]] Not resolved
2867[dcl.struct.bind] CD7 Order of initialization for structured bindings Unknown
2868[class.temporary] open Self-references in trivially copyable objects as function return values Not resolved
2869[expr.prim.this] CD7 this in local classes Unknown
2870[lex.string] CD7 Combining absent encoding-prefixes Unknown
2871[class.default.ctor] CD7 User-declared constructor templates inhibiting default constructors Unknown
2872[basic.link] CD7 Linkage and unclear "can be referred to" Unknown
2873[over.over] open Taking the address of a function involving template argument deduction Not resolved
2874[dcl.type.elab] CD7 Qualified declarations of partial specializations Unknown
2875[diff.expr] tentatively ready Missing support for round-tripping null pointer values through indirection/address operators Not resolved
2876[dcl.fct.def.general] CD7 Disambiguation of T x = delete("text") Unknown
2877[enum.udecl] CD7 Type-only lookup for using-enum-declarator Clang 19
2878[expr.cast] open C-style casts to reference types Not resolved
2879[expr.const.cast] CD7 Undesired outcomes with const_cast Unknown
2880[expr.delete] CD7 Accessibility check for destructor of incomplete class type Unknown
2881[expr.prim.lambda.closure] CD7 Type restrictions for the explicit object parameter of a lambda Clang 19
2882[expr.static.cast] CD7 Unclear treatment of conversion to void Clang 2.7
2883[basic.def.odr] CD7 Definition of "odr-usable" ignores lambda scopes No
2884[dcl.type.elab] dup Qualified declarations of partial specializations Unknown
2885[class.default.ctor] review Non-eligible trivial default constructors Not resolved
2886[class.temporary] CD7 Temporaries and trivial potentially-throwing special member functions Clang 9
2887[diff.cpp03.expr] CD7 Missing compatibility entries for xvalues Unknown
2888[basic.lookup.argdep] review Missing cases for reference and array types for argument-dependent lookup Not resolved
2889[expr.delete] open Requiring an accessible destructor for destroying operator delete Not resolved
2890[class.local] CD7 Defining members of local classes Unknown
2891[implimits] CD7 Normative status of implementation limits Unknown
2892[expr.arith.conv] CD7 Unclear usual arithmetic conversions Unknown
2893[temp.inst] NAD Instantiations in discarded if constexpr substatements Unknown
2894[expr.type.conv] CD7 Functional casts create prvalues of reference type Unknown
2895[dcl.init.general] CD7 Initialization should ignore the destination type's cv-qualification Unknown
2896[temp.deduct] review Template argument deduction involving exception specifications Not resolved
2897[class.copy.assign] open Copying potentially-overlapping union subobjects Not resolved
2898[over.best.ics.general] CD7 Clarify implicit conversion sequence from cv T to T Unknown
2899[conv.lval] CD7 Bad value representations should cause undefined behavior Unknown
2900[temp.deduct.type] open Deduction of non-type template arguments with placeholder types Not resolved
2901[basic.lval] CD7 Unclear semantics for near-match aliased access Unknown
2902[expr.prim.id.general] review Implicit this transformation outside of permitted contexts Not resolved
2903[temp.names] drafting Can we omit the template disambiguator in nested-name-specifiers in type-only contexts? Not resolved
2904[temp.pre] open Introducing template-names Not resolved
2905[temp.dep.constexpr] CD7 Value-dependence of noexcept-expression Unknown
2906[expr.cond] CD7 Lvalue-to-rvalue conversion of class types for conditional operator Unknown
2907[expr.const] CD7 Constant lvalue-to-rvalue conversion on uninitialized std::nullptr_t Unknown
2908[cpp.line] CD7 Counting physical source lines for __LINE__ Unknown
2909[expr.const] CD7 Subtle difference between constant-initialized and constexpr Unknown
2910[basic.def.odr] CD7 Effect of requirement-parameter-lists on odr-usability Unknown
2911[expr.prim.req.general] CD7 Unclear meaning of expressions "appearing within" subexpressions Unknown
2912[expr.new] open Too-large value for size in array new Not resolved
2913[temp.deduct.guide] CD7 Grammar for deduction-guide has requires-clause in the wrong position Clang 20
2914[basic.start.static] review Unclear order of initialization of static and thread-local variables Not resolved
2915[dcl.fct] CD7 Explicit object parameters of type void Clang 20
2916[temp.spec.partial] review Variable template partial specializations should not be declared static Not resolved
2917[temp.pre] review Disallow multiple friend-type-specifiers for a friend template Not resolved
2918[over.over] CD7 Consideration of constraints for address of overloaded function Clang 21
2919[over.match.ref] CD7 Conversion function candidates for initialization of const lvalue reference Unknown
2920[temp.names] open The template keyword for base classes Not resolved
2921[module.interface] CD7 Exporting redeclarations of entities not attached to a named module Unknown
2922[expr.const] CD7 constexpr placement-new is too permissive Clang 20
2923[intro.progress] tentatively ready Note about infinite loops and execution steps Not resolved
2924[defns.undefined] CD7 Undefined behavior during constant evaluation Unknown
2925[expr.delete] NAD Deleting a pointer to an incomplete enumeration type Unknown
2926[basic.lookup.qual.general] drafting Lookup context for dependent qualified names Not resolved
2927[cpp.pre] CD7 Unclear status of translation unit with module keyword Unknown
2928[basic.start.dynamic] open No ordering for initializing thread-local variables Not resolved
2929[basic.start.term] review Lifetime of trivially-destructible static or thread-local objects Not resolved
2930[class.copy.elision] CD7 Unclear term "copy/move operation" in specification of copy elision Unknown
2931[over.oper.general] CD7 Restrictions on operator functions that are explicit object member functions Unknown
2932[dcl.enum] review Value range of empty enumeration Not resolved
2933[expr.type] CD7 Dangling references Unknown
2934[dcl.fct.def.coroutine] open Unclear semantics of exception escaping from unhandled_exception Not resolved
2935[dcl.fct.def.coroutine] open Destroying the coroutine state when initial-await-resume-called is false Not resolved
2936[temp.dep.type] CD7 Local classes of templated functions should be part of the current instantiation Unknown
2937[lex.phases] CD7 Grammar for preprocessing-file has no normative effect Unknown
2938[basic.link] open Inheriting linkage from a previous declaration Not resolved
2939[expr.reinterpret.cast] CD7 Do not allow reinterpret_cast from prvalue to rvalue reference Unknown
2940[intro.object] review Definition of "object" Not resolved
2941[class.temporary] open Lifetime extension for function-style cast to reference type Not resolved
2942[dcl.fct] open Packs in a function's parameter-type-list Not resolved
2943[dcl.attr.nodiscard] CD7 Discarding a void return value Unknown
2944[expr.throw] CD7 Unsequenced throw-expressions Unknown
2945[basic.link] open Redundant constraints on matching function template declarations Not resolved
2946[temp.over.link] open Dependent call equivalence in non-ADL cases Not resolved
2947[cpp.module] open Limiting macro expansion in pp-module Not resolved
2948[temp.spec.partial.general] open Late ambiguity for partial template specialization Not resolved
2949[temp.func.order] open Treatment of ellipsis during partial ordering Not resolved
2950[class.bit] open Value preservation in enumeration vs. integer bit-fields Not resolved
2951[temp.decls.general] open Distinguishing a primary template Not resolved
2952[basic.life] open Vacuous initialization for subobjects Not resolved
2953[basic.types.general] open Value representation for non-trivially-copyable types Not resolved
2954[intro.races] NAD Simultaneous modifications of an atomic object Unknown
2955[intro.execution] open Unify rules about conflicting unordered accesses Not resolved
2956[basic.lookup.qual.general] open Missing allowance for pseudo-destructors in qualified lookup Not resolved
2957[expr.ref] open Evaluating a reference member should constitute access Not resolved
2958[over.ics.rank] open Overload resolution involving lvalue transformation and qualification conversion Not resolved
2959[expr.ref] open Naming enumerators in class member access expressions Not resolved
2960[basic.life] open Introduce discontiguous object lifetime Not resolved
2961[temp.constr] open Checking of ill-formed types in constraint-expressions Not resolved
2962[expr.const] open Evaluation of destructor call for variable with constant destruction Not resolved
2963[stmt.ambig] open Paradoxical variable-or-function declaration Not resolved
2964[conv.lval] open Reading "invalid pointer values" Not resolved
2965[basic.scope.temp] open Generic lambdas do not have a template parameter scope Not resolved
2966[basic.fundamental] open Alignment and value representation of std::nullptr_t Not resolved
2967[over.match.ref] open Explicit conversion functions Not resolved
2968[basic.lookup.general] open Name lookup result for typedef-name vs. class-name Not resolved
2969[basic.scope] open Scopes in the function-try-block of a constructor Not resolved
2970[intro.races] CD7 Races with volatile sig_atomic_t bit-fields Unknown
2971[module.global.frag] open Specializations for a class are not decl-reachable Not resolved
2972[expr.prim.id.qual] open Declarative nested-name-specifier naming a partial specialization Not resolved
2973[dcl.typedef] open Does an alias-declaration introduce a name for linkage purposes? Not resolved
2974[temp.deduct.type] open Non-deduced context for qualified-id naming a template Not resolved
2975[temp.constr.normal] open Effect of concept template-head on parameter mappings Not resolved
2976[stmt.dcl] review Transferring control out of a function Not resolved
2977[dcl.init.general] review Initialization with string literals Not resolved
2978[temp.deduct.call] open Deduction involving reference to similar types Not resolved
2979[class.mem.general] open Duplicate declarations of enumerations in class scope Not resolved
2980[temp.names] open Constraints on template template parameters Not resolved
2981[expr.arith.conv] open Usual arithmetic conversions and result types Not resolved
2982[temp.deduct.decl] CD7 Deduction in type-constraints Unknown
2983[basic.pre] review Non-type template parameters are not variables Not resolved
2984[temp.dep.constexpr] open Value-dependent structured bindings Not resolved
2985[dcl.init.ref] CD7 Unclear rules for reference initialization with conversion Unknown
2986[basic.life] open Creating objects within a mutable member of a const object Not resolved
2987[expr.static.cast] CD7 Remove dilapidated wording from static_cast Unknown
2988[basic.link] open Is a closure type from a lambda-expression appearing in a concept-definition a TU-local entity? Not resolved
2989[expr.prim.paren] open Remove misleading general allowance for parentheses Not resolved
2990[module.interface] CD7 Exporting redeclarations of namespaces Unknown
2991[dcl.init.general] open "array size" is vague Not resolved
2992[basic.pre] open Labels do not have names Not resolved
2993[dcl.fct.def.general] open Body of a destructor Not resolved
2994[temp.param] open Allowing template parameters following template parameter packs that are pack expansions Not resolved
2995[stmt.return] open Meaning of flowing off the end of a function Not resolved
2996[temp.constr.atomic] open Impenetrable definition of atomic constraint Not resolved
2997[dcl.fct.def.default] open Defaulted functions with deleted definition Not resolved
2998[temp.deduct.partial] open Missing deduction consistency check for partial ordering Not resolved
2999[class.default.ctor] open Trivial unions changing existing behavior Not resolved
3000[expr.cond] review Handling of cv-qualified class types in conditional operator Not resolved
3001[basic.life] tentatively ready Inconsistent restrictions for static_cast on pointers to out-of-lifetime objects Not resolved
3002[temp.dep.temp] tentatively ready Template parameter/argument confusion Not resolved
3003[dcl.type.simple] review Naming a deducible template for class template argument deduction Not resolved
3004[expr.const] tentatively ready Pointer arithmetic on array of unknown bound Not resolved
3005[basic.scope.scope] tentatively ready Function parameters should never be name-independent Not resolved
3006[temp.explicit] review Vague restrictions for explicit instantiations of class templates Not resolved
3007[class.compare.default] open Access checking during synthesis of defaulted comparison operator, take 2 Not resolved
3008[diff.dcl] tentatively ready Missing Annex C entry for void object declarations Not resolved
3009[expr.const] open Unclear rules for constant initialization Not resolved
3010[expr.const] open constexpr placement-new should require transparent replaceability Not resolved
3011[expr.new] tentatively ready Parenthesized aggregate initialization for new-expressions Not resolved
3012[dcl.constexpr] open Deviating constexpr or consteval across translation units Not resolved
3013[cpp.embed.gen] CD7 Disallowing macros for #embed parameters Unknown
3014[cpp.embed.gen] CD7 Comma-delimited vs. comma-separated output for #embed Unknown
3015[cpp.include] CD7 Handling of header-names for #include and #embed Unknown
3016[cpp.cond] CD7 Satisfying the syntactic requirements of #include and #embed Unknown
3017[cpp.cond] open Commas in controlling expression of conditional inclusion Not resolved
3018[cpp.cond] CD7 Validity of defined in __has_embed Unknown
3019[lex.header] open Restrictions on character sequences in header-names Not resolved
3020[cpp.cond] CD7 Missing specification for __has_cpp_attribute(indeterminate) Unknown
3021[temp.constr.order] drafting Subsumption rules for fold expanded constraints Not resolved
3022[class.dtor] review Redundant specification of explicit destructor calls Not resolved
3023[dcl.init.list] open Default arguments in list-initialization Not resolved
3024[dcl.align] open Alignment of references Not resolved
3025[basic.stc.dynamic.deallocation] open Deallocation functions returning void Not resolved
3026[expr.unary.op] open Class for pointer-to-member formation Not resolved
3027[temp.type] open Equivalence of pack-index-specifiers Not resolved
3028[namespace.udecl] open A using-declarator should bind a name Not resolved
3029[basic.align] drafting Confusing note about ordinary character types for aligned memory areas Not resolved
3030[dcl.array] open Initializing array prvalues of unknown bound Not resolved
3031[over.match.funcs.general] open Finding declarations for conversion operators for access checking Not resolved
3032[temp.arg.general] tentatively ready Template argument disambiguation Not resolved
3033[basic.scope.namespace] open Scope after declarator-id before determining correspondence Not resolved
3034[temp.inst] open Infinite recursion should hit an implementation limit Not resolved
3035[class.union.anon] open Lambda expressions in anonymous unions Not resolved
3036[basic.extended.fp] open Extended floating-point types should not be cv-qualified Not resolved
3037[namespace.udecl] open Name lookup results for using-declarators Not resolved
3038[dcl.attr.grammar] open Ignorability of attributes, again Not resolved
3039[intro.object] open Undefined behavior from implicit object creation ignores observable checkpoints Not resolved
3040[dcl.fct.def.coroutine] open Mishandling of lambda coroutines Not resolved
3041[class.dtor] open Overly aggressive rule for deleting the destructor of a union Not resolved
3042[basic.lval] open Implicit object creation is insufficient to model effective type rule of C Not resolved
3043[class.temporary] open Lifetime extension for temporaries in expansion statements Not resolved
3044[stmt.expand] tentatively ready Iterating expansion statements woes Not resolved
3045[basic.scope.block] tentatively ready Regularizing environment interactions of expansion statement Not resolved
3046[dcl.enum] open Enumerations as part of the common initial sequence Not resolved
3047[basic.life] open Calling destructors on out-of-lifetime objects Not resolved
3048[stmt.expand] tentatively ready Empty destructuring expansion statements Not resolved
3049[class.prop] open Implicitly deleted move operation should not disable trivial relocation Not resolved
3050[dcl.attr.deprecated] open [[deprecated]] for class template partial specializations Not resolved
3051[class.mem.general] open Missing specification for types of member subobjects Not resolved
3052[stmt.return] open Unclear handling of checks on discarded return statements Not resolved
3053[cpp.replace.general] tentatively ready Allowing #undef likely Not resolved
3054[expr.call] open Use of default arguments depending on shape of postfix-expression in a function call Not resolved
3055[over.call.object] open Misleading body for surrogate call function Not resolved
3056[expr.prim.req.type] open Missing semicolons in grammar for type-requirement Not resolved
3057[over.ics.ref] open Ranking of derived-to-base conversions should ignore reference binding Not resolved
3058[basic.lookup.general] open "Program point" is not defined Not resolved
3059[expr.const] open throw; in constant expressions Not resolved
3060[basic.start.main] open Change in behavior for noexcept main Not resolved
3061[stmt.expand] tentatively ready Trailing comma in an expansion-init-list Not resolved
3062[dcl.fct.default] open Overlapping specification of default template arguments Not resolved
3063[class.temporary] open Lifetime extension of temporaries past function return Not resolved
3064[basic.life] open Mishandling of placement-new in lifetime rules Not resolved
3065[basic.types.general] open Reachability and completeness of types Not resolved
3066[expr.prim.id.qual] tentatively ready Declarative nested-name-specifier in explicit instantiation Not resolved
3067[conv.array] open Array-to-pointer conversion with object type mismatch Not resolved
3068[class.access.general] open Access checking in friends involving qualified-ids Not resolved
3069[temp.constr.normal] open Reference to wrong placeholder Not resolved
3070[class.copy.assign] open Trivial assignment can skip member subobjects Not resolved
3071[dcl.struct.bind] open Negative tuple_size in structured bindings Not resolved
3072[temp.deduct.general] open Incorrect examples for lambda SFINAE Not resolved
3073[over.match.ref] open Dependence of R on T2 is unclear Not resolved
3074[cpp.module] tentatively ready Redundant ill-formedness for module macros Not resolved
3075[cpp.import] tentatively ready Unclear matching of import directive Not resolved
3076[cpp.include] tentatively ready Remove unnecessary IFNDR for malformed header-name-tokens Not resolved
3077[cpp.pre] tentatively ready Undesirable formation of import directive with string-literal Not resolved
3078[cpp.include] review Different treatment of #include pp-tokens and header-name-tokens Not resolved
3079[class.union.anon] open Allow empty-declarations in anonymous unions Not resolved
3080[temp.arg.template] tentatively ready Clarify kinds of permitted template template arguments Not resolved
3081[expr.ref] review Require glvalue when splicing direct base class relationship Not resolved
3082[expr.reinterpret.cast] tentatively ready Allow for call-compatible function types in reinterpret_cast Not resolved
3083[stmt.pre] tentatively ready Remove redundant restrictions on class and enum definitions Not resolved
3084[stmt.cont] tentatively ready compound-statements inside iteration-statements Not resolved
3085[stmt.pre] tentatively ready Apply restriction inside for-range-declaration Not resolved
3086[cpp.pragma.op] tentatively ready Destringizing should consider all sorts of encoding-prefixes Not resolved
3087[cpp.pragma.op] open Destringizing for raw string literals Not resolved
3088[cpp.replace.general] open Clarify macro treatment of identifiers with special meaning Not resolved
3089[dcl.init.general] tentatively ready const-default-constructible improperly handles std::meta::info Not resolved
3090[module.interface] tentatively ready Internal linkage from header units Not resolved
3091[basic.link] review Linking of translation units as sequences of tokens Not resolved
3092[dcl.attr.annotation] tentatively ready base-specifiers are not "declared" Not resolved
3093[expr.prim.splice] open Missing integration of direct base class relationships Not resolved
3094[lex.phases] review Rework phases for string literal concatenation and token formation Not resolved
3095[temp.dep.expr] open Type-dependent packs that are not structured binding packs Not resolved
3096[temp.dep.constexpr] open Value-dependence of size of structured binding pack with non-dependent initializer Not resolved
3097[basic.scope.scope] tentatively ready Lambda expression introduces a scope Not resolved
3098[temp.names] tentatively ready Remove redundancy "names or designates" Not resolved
3099[temp.inst] open Instantiation of type aliases from alias templates is unspecified Not resolved
3100[basic.start.term] open Destruction order for objects with static storage duration Not resolved